diff options
Diffstat (limited to 'net')
677 files changed, 29261 insertions, 10592 deletions
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 1c140af06d52..600b9563bfc5 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -170,7 +170,8 @@ static void lowpan_dev_debugfs_ctx_init(struct net_device *dev, struct dentry *root; char buf[32]; - WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); + if (WARN_ON_ONCE(id >= LOWPAN_IPHC_CTX_TABLE_SIZE)) + return; sprintf(buf, "%d", id); diff --git a/net/802/Makefile b/net/802/Makefile index 19406a87bdaa..bfed80221b8b 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_LLC) += p8022.o psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o obj-$(CONFIG_ATALK) += p8022.o psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o diff --git a/net/802/garp.c b/net/802/garp.c index 400bd857e5f5..f6012f8e59f0 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -203,6 +203,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr kfree(attr); } +static void garp_attr_destroy_all(struct garp_applicant *app) +{ + struct rb_node *node, *next; + struct garp_attr *attr; + + for (node = rb_first(&app->gid); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct garp_attr, node); + garp_attr_destroy(app, attr); + } +} + static int garp_pdu_init(struct garp_applicant *app) { struct sk_buff *skb; @@ -609,6 +622,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); + garp_attr_destroy_all(app); garp_pdu_queue(app); spin_unlock_bh(&app->lock); diff --git a/net/802/mrp.c b/net/802/mrp.c index bea6e43d45a0..35e04cc5390c 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -292,6 +292,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) kfree(attr); } +static void mrp_attr_destroy_all(struct mrp_applicant *app) +{ + struct rb_node *node, *next; + struct mrp_attr *attr; + + for (node = rb_first(&app->mad); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct mrp_attr, node); + mrp_attr_destroy(app, attr); + } +} + static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; @@ -895,6 +908,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); + mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); diff --git a/net/802/p8023.c b/net/802/p8023.c deleted file mode 100644 index 19cd56990db2..000000000000 --- a/net/802/p8023.c +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: 802.3 data link hooks used for IPX 802.3 - * - * 802.3 isn't really a protocol data link layer. Some old IPX stuff - * uses it however. Note that there is only one 802.3 protocol layer - * in the system. We don't currently support different protocols - * running raw 802.3 on different devices. Thankfully nobody else - * has done anything like the old IPX. - */ - -#include <linux/in.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/slab.h> - -#include <net/datalink.h> -#include <net/p8022.h> - -/* - * Place an 802.3 header on a packet. The driver will do the mac - * addresses, we just need to give it the buffer length. - */ -static int p8023_request(struct datalink_proto *dl, - struct sk_buff *skb, unsigned char *dest_node) -{ - struct net_device *dev = skb->dev; - - dev_hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); - return dev_queue_xmit(skb); -} - -/* - * Create an 802.3 client. Note there can be only one 802.3 client - */ -struct datalink_proto *make_8023_client(void) -{ - struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - - if (proto) { - proto->header_length = 0; - proto->request = p8023_request; - } - return proto; -} - -/* - * Destroy the 802.3 client. - */ -void destroy_8023_client(struct datalink_proto *dl) -{ - kfree(dl); -} - -EXPORT_SYMBOL(destroy_8023_client); -EXPORT_SYMBOL(make_8023_client); - -MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index fb3d3262dc1a..55275ef9a31a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -67,7 +67,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; - array = kzalloc(size, GFP_KERNEL); + array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; @@ -638,7 +638,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) case GET_VLAN_REALDEV_NAME_CMD: err = 0; - vlan_dev_get_realdev_name(dev, args.u.device2); + vlan_dev_get_realdev_name(dev, args.u.device2, + sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index fa3ad3d4d58c..1a705a4ef7fa 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -108,7 +108,8 @@ static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) netdev_features_t ret; ret = real_dev->hw_enc_features & - (NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO | NETIF_F_GSO_ENCAP_ALL); + (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; @@ -129,7 +130,8 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev, int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); -void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, + size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4db3f0621959..0c21d1fec852 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -239,9 +239,9 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) return 0; } -void vlan_dev_get_realdev_name(const struct net_device *dev, char *result) +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { - strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23); + strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, @@ -360,7 +360,7 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct ifreq ifrr; int err = -EOPNOTSUPP; - strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); + strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { @@ -372,8 +372,8 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: case SIOCSMIIREG: case SIOCGHWTSTAMP: - if (netif_device_present(real_dev) && ops->ndo_do_ioctl) - err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd); + if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) + err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; } @@ -814,7 +814,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, - .ndo_do_ioctl = vlan_dev_ioctl, + .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) diff --git a/net/9p/client.c b/net/9p/client.c index b7b958f61faf..213f12ed76cd 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -30,6 +30,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/9p.h> +#define DEFAULT_MSIZE (128 * 1024) + /* * Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options @@ -65,7 +67,7 @@ EXPORT_SYMBOL(p9_is_proto_dotu); int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) { - if (clnt->msize != 8192) + if (clnt->msize != DEFAULT_MSIZE) seq_printf(m, ",msize=%u", clnt->msize); seq_printf(m, ",trans=%s", clnt->trans_mod->name); @@ -139,7 +141,7 @@ static int parse_opts(char *opts, struct p9_client *clnt) int ret = 0; clnt->proto_version = p9_proto_2000L; - clnt->msize = 8192; + clnt->msize = DEFAULT_MSIZE; if (!opts) return 0; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index f4dd0456beaf..007bbcc68010 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -34,7 +34,7 @@ #include <linux/syscalls.h> /* killme */ #define P9_PORT 564 -#define MAX_SOCK_BUF (64*1024) +#define MAX_SOCK_BUF (1024*1024) #define MAXPOLLWADDR 2 static struct p9_trans_module p9_tcp_trans; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 93f2f8654882..490a4c900339 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -99,7 +99,7 @@ static unsigned int rest_of_page(void *data) * @client: client instance * * This reclaims a channel by freeing its resources and - * reseting its inuse flag. + * resetting its inuse flag. * */ @@ -463,7 +463,7 @@ req_retry_pinned: * For example TREAD have 11. * 11 is the read/write header = PDU Header(7) + IO Size (4). * Arrange in such a way that server places header in the - * alloced memory and payload onto the user buffer. + * allocated memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); @@ -610,7 +610,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); if (!chan->vc_wq) { err = -ENOMEM; - goto out_free_tag; + goto out_remove_file; } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; @@ -628,6 +628,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) return 0; +out_remove_file: + sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); out_free_tag: kfree(tag); out_free_vq: @@ -760,7 +762,7 @@ static struct p9_trans_module p9_virtio_trans = { .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response - * headers. We also skip one more entry to accomodate, address + * headers. We also skip one more entry to accommodate, address * that are not at page boundary, that can result in an extra * page in zero copy. */ diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index f4fea28e05da..3ec1a51a6944 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -138,7 +138,7 @@ static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) { - struct xen_9pfs_front_priv *priv = NULL; + struct xen_9pfs_front_priv *priv; RING_IDX cons, prod, masked_cons, masked_prod; unsigned long flags; u32 size = p9_req->tc.size; @@ -151,7 +151,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) break; } read_unlock(&xen_9pfs_lock); - if (!priv || priv->client != client) + if (list_entry_is_head(priv, &xen_9pfs_devs, list)) return -EINVAL; num = p9_req->tc.tag % priv->num_rings; diff --git a/net/Kconfig b/net/Kconfig index c7392c449b25..fb13460c6dab 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -363,6 +363,7 @@ source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" source "net/strparser/Kconfig" +source "net/mctp/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9ca9572188fe..fbfeb8a0bb37 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ +obj-$(CONFIG_MCTP) += mctp/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ebda397fa95a..bf5736c1d458 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -666,7 +666,7 @@ static int atif_ioctl(int cmd, void __user *arg) struct rtentry rtdef; int add_route; - if (copy_from_user(&atreq, arg, sizeof(atreq))) + if (get_user_ifreq(&atreq, NULL, arg)) return -EFAULT; dev = __dev_get_by_name(&init_net, atreq.ifr_name); @@ -707,7 +707,7 @@ static int atif_ioctl(int cmd, void __user *arg) /* * Phase 1 is fine on LocalTalk but we don't do - * EtherTalk phase 1. Anyone wanting to add it go ahead. + * EtherTalk phase 1. Anyone wanting to add it, go ahead. */ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) return -EPROTONOSUPPORT; @@ -828,7 +828,7 @@ static int atif_ioctl(int cmd, void __user *arg) nr = (struct atalk_netrange *)&(atif->nets); /* * Phase 1 is fine on Localtalk but we don't do - * Ethertalk phase 1. Anyone wanting to add it go ahead. + * Ethertalk phase 1. Anyone wanting to add it, go ahead. */ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) return -EPROTONOSUPPORT; @@ -865,7 +865,7 @@ static int atif_ioctl(int cmd, void __user *arg) return 0; } - return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0; + return put_user_ifreq(&atreq, arg); } static int atrtr_ioctl_addrt(struct rtentry *rt) @@ -2018,7 +2018,7 @@ module_init(atalk_init); * by the network device layer. * * Ergo, before the AppleTalk module can be removed, all AppleTalk - * sockets be closed from user space. + * sockets should be closed from user space. */ static void __exit atalk_exit(void) { diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index aa1b57161f3b..0fdbdfd19474 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -11,7 +11,7 @@ #define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev) -static ssize_t show_type(struct device *cdev, +static ssize_t type_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -19,7 +19,7 @@ static ssize_t show_type(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type); } -static ssize_t show_address(struct device *cdev, +static ssize_t address_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -27,7 +27,7 @@ static ssize_t show_address(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi); } -static ssize_t show_atmaddress(struct device *cdev, +static ssize_t atmaddress_show(struct device *cdev, struct device_attribute *attr, char *buf) { unsigned long flags; @@ -50,7 +50,7 @@ static ssize_t show_atmaddress(struct device *cdev, return count; } -static ssize_t show_atmindex(struct device *cdev, +static ssize_t atmindex_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -58,7 +58,7 @@ static ssize_t show_atmindex(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number); } -static ssize_t show_carrier(struct device *cdev, +static ssize_t carrier_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -67,7 +67,7 @@ static ssize_t show_carrier(struct device *cdev, adev->signal == ATM_PHY_SIG_LOST ? 0 : 1); } -static ssize_t show_link_rate(struct device *cdev, +static ssize_t link_rate_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); @@ -90,12 +90,12 @@ static ssize_t show_link_rate(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } -static DEVICE_ATTR(address, 0444, show_address, NULL); -static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL); -static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL); -static DEVICE_ATTR(carrier, 0444, show_carrier, NULL); -static DEVICE_ATTR(type, 0444, show_type, NULL); -static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL); +static DEVICE_ATTR_RO(address); +static DEVICE_ATTR_RO(atmaddress); +static DEVICE_ATTR_RO(atmindex); +static DEVICE_ATTR_RO(carrier); +static DEVICE_ATTR_RO(type); +static DEVICE_ATTR_RO(link_rate); static struct device_attribute *atm_attrs[] = { &dev_attr_atmaddress, diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 3e17a5ecaa94..dd2a8dabed84 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -93,8 +93,8 @@ struct br2684_dev { * This lock should be held for writing any time the list of devices or * their attached vcc's could be altered. It should be held for reading * any time these are being queried. Note that we sometimes need to - * do read-locking under interrupt context, so write locking must block - * the current CPU's interrupts + * do read-locking under interrupting context, so write locking must block + * the current CPU's interrupts. */ static DEFINE_RWLOCK(devs_lock); diff --git a/net/atm/resources.c b/net/atm/resources.c index 53236986dfe0..2b2d33eeaf20 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -52,10 +52,8 @@ static struct atm_dev *__alloc_atm_dev(const char *type) static struct atm_dev *__atm_dev_lookup(int number) { struct atm_dev *dev; - struct list_head *p; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + list_for_each_entry(dev, &atm_devs, dev_list) { if (dev->number == number) { atm_dev_hold(dev); return dev; @@ -215,8 +213,7 @@ int atm_getnames(void __user *buf, int __user *iobuf_len) return -ENOMEM; } tmp_p = tmp_buf; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); + list_for_each_entry(dev, &atm_devs, dev_list) { *tmp_p++ = dev->number; } mutex_unlock(&atm_dev_mutex); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index e4f63dd43cb5..36249776c021 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -193,10 +193,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) skb_pull(skb, AX25_KISS_HEADER_LEN); if (digipeat != NULL) { - if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) { - kfree_skb(skb); + if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) goto put; - } skb = ourskb; } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index f53751ba81b3..22f2f66c6e0a 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -325,7 +325,6 @@ void ax25_kick(ax25_cb *ax25) void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) { - struct sk_buff *skbn; unsigned char *ptr; int headroom; @@ -336,18 +335,12 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) headroom = ax25_addr_size(ax25->digipeat); - if (skb_headroom(skb) < headroom) { - if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) { + if (unlikely(skb_headroom(skb) < headroom)) { + skb = skb_expand_head(skb, headroom); + if (!skb) { printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n"); - kfree_skb(skb); return; } - - if (skb->sk != NULL) - skb_set_owner_w(skbn, skb->sk); - - consume_skb(skb); - skb = skbn; } ptr = skb_push(skb, headroom); diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b40e0bce67ea..d0b2e094bd55 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -441,24 +441,17 @@ put: struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_address *dest, ax25_digi *digi) { - struct sk_buff *skbn; unsigned char *bp; int len; len = digi->ndigi * AX25_ADDR_LEN; - if (skb_headroom(skb) < len) { - if ((skbn = skb_realloc_headroom(skb, len)) == NULL) { + if (unlikely(skb_headroom(skb) < len)) { + skb = skb_expand_head(skb, len); + if (!skb) { printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n"); return NULL; } - - if (skb->sk != NULL) - skb_set_owner_w(skbn, skb->sk); - - consume_skb(skb); - - skb = skbn; } bp = skb_push(skb, len); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index fc8be49010b9..f94f538fa382 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -519,8 +519,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, } out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); return res; } @@ -857,8 +856,7 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) rcu_read_unlock(); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) @@ -1046,14 +1044,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, unlock: rcu_read_unlock(); out: - if (neigh_node) - batadv_neigh_node_put(neigh_node); - if (router) - batadv_neigh_node_put(router); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(neigh_node); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -1194,8 +1188,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, ret = true; out: - if (neigh_node) - batadv_neigh_node_put(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -1496,16 +1489,11 @@ out_neigh: if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); - if (router_router) - batadv_neigh_node_put(router_router); - if (orig_neigh_router) - batadv_neigh_node_put(orig_neigh_router); - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_node_put(router_router); + batadv_neigh_node_put(orig_neigh_router); + batadv_hardif_neigh_put(hardif_neigh); consume_skb(skb_priv); } @@ -1851,6 +1839,8 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || @@ -1924,8 +1914,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } out: - if (neigh_node_best) - batadv_neigh_node_put(neigh_node_best); + batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; @@ -2047,10 +2036,8 @@ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, *diff = (int)tq1 - (int)tq2; out: - if (neigh1_ifinfo) - batadv_neigh_ifinfo_put(neigh1_ifinfo); - if (neigh2_ifinfo) - batadv_neigh_ifinfo_put(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } @@ -2080,6 +2067,8 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, @@ -2295,8 +2284,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (tmp_gw_factor > max_gw_factor || (tmp_gw_factor == max_gw_factor && tq_avg > max_tq)) { - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } @@ -2310,8 +2298,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) * $routing_class more tq points) */ if (tq_avg > max_tq) { - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } @@ -2328,8 +2315,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) next: batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); @@ -2393,14 +2379,10 @@ static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, ret = true; out: - if (router_gw_ifinfo) - batadv_neigh_ifinfo_put(router_gw_ifinfo); - if (router_orig_ifinfo) - batadv_neigh_ifinfo_put(router_orig_ifinfo); - if (router_gw) - batadv_neigh_node_put(router_gw); - if (router_orig) - batadv_neigh_node_put(router_orig); + batadv_neigh_ifinfo_put(router_gw_ifinfo); + batadv_neigh_ifinfo_put(router_orig_ifinfo); + batadv_neigh_node_put(router_gw); + batadv_neigh_node_put(router_orig); return ret; } @@ -2461,6 +2443,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, @@ -2473,12 +2457,9 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, ret = 0; out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); + batadv_gw_node_put(curr_gw); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); return ret; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index e1ca2b8c3152..54e41fc709c3 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -106,8 +106,7 @@ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) batadv_v_primary_iface_set(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void @@ -146,6 +145,8 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, @@ -298,6 +299,8 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || @@ -362,8 +365,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } out: - if (neigh_node_best) - batadv_neigh_node_put(neigh_node_best); + batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; @@ -564,10 +566,8 @@ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) ret = 0; out: - if (router) - batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(router_ifinfo); return ret; } @@ -595,8 +595,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (curr_gw && bw <= max_bw) goto next; - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); @@ -658,10 +657,8 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, ret = true; out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (orig_gw) - batadv_gw_node_put(orig_gw); + batadv_gw_node_put(curr_gw); + batadv_gw_node_put(orig_gw); return ret; } @@ -739,6 +736,12 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, goto out; } + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + router->if_incoming->net_dev->ifindex)) { + genlmsg_cancel(msg, hdr); + goto out; + } + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); @@ -754,12 +757,9 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, ret = 0; out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); + batadv_gw_node_put(curr_gw); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); return ret; } diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 423c2d171703..71999e13f729 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -486,14 +486,11 @@ static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv, hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval); hardif_free: - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); neigh_free: - if (neigh) - batadv_neigh_node_put(neigh); + batadv_neigh_node_put(neigh); orig_free: - if (orig_neigh) - batadv_orig_node_put(orig_neigh); + batadv_orig_node_put(orig_neigh); } /** diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index a0a9636d1740..1d750f3cb2e4 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -584,12 +584,9 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, batadv_v_ogm_queue_on_if(skb, if_outgoing); out: - if (orig_ifinfo) - batadv_orig_ifinfo_put(orig_ifinfo); - if (router) - batadv_neigh_node_put(router); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(neigh_ifinfo); } /** @@ -669,10 +666,8 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, else ret = 0; out: - if (orig_ifinfo) - batadv_orig_ifinfo_put(orig_ifinfo); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); return ret; } @@ -763,16 +758,11 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: - if (router) - batadv_neigh_node_put(router); - if (orig_neigh_router) - batadv_neigh_node_put(orig_neigh_router); - if (orig_neigh_node) - batadv_orig_node_put(orig_neigh_node); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_node_put(orig_neigh_router); + batadv_orig_node_put(orig_neigh_node); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); return forward; } @@ -978,12 +968,9 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, } rcu_read_unlock(); out: - if (orig_node) - batadv_orig_node_put(orig_node); - if (neigh_node) - batadv_neigh_node_put(neigh_node); - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_orig_node_put(orig_node); + batadv_neigh_node_put(neigh_node); + batadv_hardif_neigh_put(hardif_neigh); } /** diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 7dc133cfc363..1669744304c5 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -162,6 +162,9 @@ static void batadv_backbone_gw_release(struct kref *ref) */ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { + if (!backbone_gw) + return; + kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } @@ -197,6 +200,9 @@ static void batadv_claim_release(struct kref *ref) */ static void batadv_claim_put(struct batadv_bla_claim *claim) { + if (!claim) + return; + kref_put(&claim->refcount, batadv_claim_release); } @@ -395,7 +401,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame - * set HW SRC to the special mac containg the crc + * set HW SRC to the special mac containing the crc */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, @@ -439,8 +445,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, netif_rx_any_context(skb); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } /** @@ -1040,7 +1045,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, /* lets see if this originator is in our mesh */ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr); - /* dont accept claims from gateways which are not in + /* don't accept claims from gateways which are not in * the same mesh or group. */ if (!orig_node) @@ -1498,8 +1503,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) rcu_read_unlock(); } out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); @@ -1808,8 +1812,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) batadv_hash_destroy(bat_priv->bla.backbone_hash); bat_priv->bla.backbone_hash = NULL; } - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } /** @@ -1996,10 +1999,8 @@ handled: ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (claim) - batadv_claim_put(claim); + batadv_hardif_put(primary_if); + batadv_claim_put(claim); return ret; } @@ -2103,10 +2104,8 @@ allow: handled: ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (claim) - batadv_claim_put(claim); + batadv_hardif_put(primary_if); + batadv_claim_put(claim); return ret; } @@ -2271,11 +2270,9 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); return ret; } @@ -2443,11 +2440,9 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); return ret; } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 5c22955bb9d5..8673a265995f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -52,7 +52,6 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); -int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); #ifdef CONFIG_BATMAN_ADV_DAT bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 8c95a11a830a..2f008e329007 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -127,6 +127,9 @@ static void batadv_dat_entry_release(struct kref *ref) */ static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) { + if (!dat_entry) + return; + kref_put(&dat_entry->refcount, batadv_dat_entry_release); } @@ -405,8 +408,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, &dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid)); out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); } #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -594,8 +596,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, continue; max = tmp_max; - if (max_orig_node) - batadv_orig_node_put(max_orig_node); + batadv_orig_node_put(max_orig_node); max_orig_node = orig_node; } rcu_read_unlock(); @@ -981,11 +982,9 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); return ret; } @@ -1218,8 +1217,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, BATADV_P_DAT_DHT_GET); } out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } @@ -1286,8 +1284,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, ret = true; } out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); if (ret) kfree_skb(skb); return ret; @@ -1420,8 +1417,7 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, out: if (dropped) kfree_skb(skb); - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); /* if dropped == false -> deliver to the interface */ return dropped; } @@ -1830,7 +1826,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, ret = true; out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a5d9d800082b..0899a729a23f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -381,10 +381,8 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, } out: - if (orig_node_dst) - batadv_orig_node_put(orig_node_dst); - if (neigh_node) - batadv_neigh_node_put(neigh_node); + batadv_orig_node_put(orig_node_dst); + batadv_neigh_node_put(neigh_node); return ret; } diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 007f2827935d..b7466136e292 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -59,7 +59,7 @@ * after rcu grace period * @ref: kref pointer of the gw_node */ -static void batadv_gw_node_release(struct kref *ref) +void batadv_gw_node_release(struct kref *ref) { struct batadv_gw_node *gw_node; @@ -70,16 +70,6 @@ static void batadv_gw_node_release(struct kref *ref) } /** - * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release - * it - * @gw_node: gateway node to free - */ -void batadv_gw_node_put(struct batadv_gw_node *gw_node) -{ - kref_put(&gw_node->refcount, batadv_gw_node_release); -} - -/** * batadv_gw_get_selected_gw_node() - Get currently selected gateway * @bat_priv: the bat priv with all the soft interface information * @@ -130,8 +120,7 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) unlock: rcu_read_unlock(); out: - if (gw_node) - batadv_gw_node_put(gw_node); + batadv_gw_node_put(gw_node); return orig_node; } @@ -148,8 +137,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, true); - if (curr_gw_node) - batadv_gw_node_put(curr_gw_node); + batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -284,14 +272,10 @@ void batadv_gw_election(struct batadv_priv *bat_priv) batadv_gw_select(bat_priv, next_gw); out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (next_gw) - batadv_gw_node_put(next_gw); - if (router) - batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_gw_node_put(curr_gw); + batadv_gw_node_put(next_gw); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -325,8 +309,7 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, reselect: batadv_gw_reselect(bat_priv); out: - if (curr_gw_orig) - batadv_orig_node_put(curr_gw_orig); + batadv_orig_node_put(curr_gw_orig); } /** @@ -466,13 +449,11 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); } out: - if (gw_node) - batadv_gw_node_put(gw_node); + batadv_gw_node_put(gw_node); } /** @@ -555,10 +536,8 @@ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); return ret; } @@ -780,15 +759,10 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, batadv_neigh_ifinfo_put(old_ifinfo); out: - if (orig_dst_node) - batadv_orig_node_put(orig_dst_node); - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (gw_node) - batadv_gw_node_put(gw_node); - if (neigh_old) - batadv_neigh_node_put(neigh_old); - if (neigh_curr) - batadv_neigh_node_put(neigh_curr); + batadv_orig_node_put(orig_dst_node); + batadv_gw_node_put(curr_gw); + batadv_gw_node_put(gw_node); + batadv_neigh_node_put(neigh_old); + batadv_neigh_node_put(neigh_curr); return out_of_range; } diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 2ae5846ef958..95c2ccdaa554 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -27,7 +28,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_free(struct batadv_priv *bat_priv); -void batadv_gw_node_put(struct batadv_gw_node *gw_node); +void batadv_gw_node_release(struct kref *ref); struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); @@ -38,4 +39,17 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); +/** + * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release + * it + * @gw_node: gateway node to free + */ +static inline void batadv_gw_node_put(struct batadv_gw_node *gw_node) +{ + if (!gw_node) + return; + + kref_put(&gw_node->refcount, batadv_gw_node_release); +} + #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index fdde305a198e..9349c76f30c5 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -10,7 +10,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> -#include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/limits.h> #include <linux/math64.h> #include <linux/netdevice.h> diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 4a6a25d551a8..8a2b78f9c4b2 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -9,7 +9,6 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> -#include <linux/errno.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> @@ -237,8 +236,7 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) real_netdev = dev_get_by_index(real_net, ifindex); out: - if (hard_iface) - batadv_hardif_put(hard_iface); + batadv_hardif_put(hard_iface); return real_netdev; } @@ -403,7 +401,7 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, goto out; } - /* >1 neighbors -> (re)brodcast */ + /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; @@ -458,8 +456,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, batadv_dat_init_own_addr(bat_priv, primary_if); batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, @@ -482,8 +479,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: - if (curr_hard_iface) - batadv_hardif_put(curr_hard_iface); + batadv_hardif_put(curr_hard_iface); } static bool @@ -658,8 +654,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) bat_priv->algo_ops->iface.activate(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void @@ -678,43 +673,16 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) } /** - * batadv_master_del_slave() - remove hard_iface from the current master iface - * @slave: the interface enslaved in another master - * @master: the master from which slave has to be removed - * - * Invoke ndo_del_slave on master passing slave as argument. In this way the - * slave is free'd and the master can correctly change its internal state. - * - * Return: 0 on success, a negative value representing the error otherwise - */ -static int batadv_master_del_slave(struct batadv_hard_iface *slave, - struct net_device *master) -{ - int ret; - - if (!master) - return 0; - - ret = -EBUSY; - if (master->netdev_ops->ndo_del_slave) - ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev); - - return ret; -} - -/** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface - * @net: the applicable net namespace - * @iface_name: name of the soft interface + * @soft_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net *net, const char *iface_name) + struct net_device *soft_iface) { struct batadv_priv *bat_priv; - struct net_device *soft_iface, *master; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); int ret; @@ -724,35 +692,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, kref_get(&hard_iface->refcount); - soft_iface = dev_get_by_name(net, iface_name); - - if (!soft_iface) { - soft_iface = batadv_softif_create(net, iface_name); - - if (!soft_iface) { - ret = -ENOMEM; - goto err; - } - - /* dev_get_by_name() increases the reference counter for us */ - dev_hold(soft_iface); - } - - if (!batadv_softif_is_valid(soft_iface)) { - pr_err("Can't create batman mesh interface %s: already exists as regular interface\n", - soft_iface->name); - ret = -EINVAL; - goto err_dev; - } - - /* check if the interface is enslaved in another virtual one and - * in that case unlink it first - */ - master = netdev_master_upper_dev_get(hard_iface->net_dev); - ret = batadv_master_del_slave(hard_iface, master); - if (ret) - goto err_dev; - + dev_hold(soft_iface); hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); @@ -810,7 +750,6 @@ err_upper: err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); -err: batadv_hardif_put(hard_iface); return ret; } @@ -868,8 +807,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) new_if = batadv_hardif_get_active(hard_iface->soft_iface); batadv_primary_if_select(bat_priv, new_if); - if (new_if) - batadv_hardif_put(new_if); + batadv_hardif_put(new_if); } bat_priv->algo_ops->iface.disable(hard_iface); @@ -891,8 +829,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) batadv_hardif_put(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static struct batadv_hard_iface * @@ -1047,8 +984,7 @@ static int batadv_hard_if_event(struct notifier_block *this, hardif_put: batadv_hardif_put(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); return NOTIFY_DONE; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 83d11b46a9d8..64f660dbbe54 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -16,7 +16,6 @@ #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> -#include <net/net_namespace.h> /** * enum batadv_hard_if_state - State of a hard interface @@ -75,7 +74,7 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net *net, const char *iface_name); + struct net_device *soft_iface); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); @@ -90,6 +89,9 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, */ static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) { + if (!hard_iface) + return; + kref_put(&hard_iface->refcount, batadv_hardif_release); } diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 46696759f194..fb251c385a1b 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -18,7 +18,7 @@ #include <linux/stddef.h> #include <linux/types.h> -/* callback to a compare function. should compare 2 element datas for their +/* callback to a compare function. should compare 2 element data for their * keys * * Return: true if same and false if not same diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index f0e5d1429662..7a93a1e94c40 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -7,7 +7,7 @@ #include "log.h" #include "main.h" -#include <stdarg.h> +#include <linux/stdarg.h> #include "trace.h" diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8f0102b71656..058b8f2eef65 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2021.1" +#define BATADV_SOURCE_VERSION "2021.3" #endif /* B.A.T.M.A.N. parameters */ @@ -88,7 +88,6 @@ /* number of packets to send for broadcasts on different interface types */ #define BATADV_NUM_BCASTS_DEFAULT 1 #define BATADV_NUM_BCASTS_WIRELESS 3 -#define BATADV_NUM_BCASTS_MAX 3 /* length of the single packet used by the TP meter */ #define BATADV_TP_PACKET_LEN ETH_DATA_LEN diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 1d63c8cbbfe7..a3b6658ed789 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -91,8 +91,7 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); - if (upper) - dev_hold(upper); + dev_hold(upper); rcu_read_unlock(); return upper; @@ -193,53 +192,22 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ -#if IS_ENABLED(CONFIG_IPV6) static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { - struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct net_device *dev = bat_priv->soft_iface; - struct br_ip_list *br_ip_entry, *tmp; - u8 flags = BATADV_MCAST_WANT_NO_RTR6; - int ret; + u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; - /* TODO: ask the bridge if a multicast router is present (the bridge - * is capable of performing proper RFC4286 multicast router - * discovery) instead of searching for a ff02::2 listener here - */ - ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); - if (ret < 0) - return BATADV_NO_FLAGS; - - list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { - /* the bridge snooping does not maintain IPv4 link-local - * addresses - therefore we won't find any IPv4 multicast router - * address here, only IPv6 ones - */ - if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && - ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.dst.ip6)) - flags &= ~BATADV_MCAST_WANT_NO_RTR6; - - list_del(&br_ip_entry->list); - kfree(br_ip_entry); - } + if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) + flags |= BATADV_MCAST_WANT_NO_RTR4; + if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) + flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } -#else -static inline u8 -batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, - struct net_device *bridge) -{ - if (bridge) - return BATADV_NO_FLAGS; - else - return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; -} -#endif /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags @@ -540,8 +508,7 @@ batadv_mcast_mla_softif_get(struct net_device *dev, } out: - if (bridge) - dev_put(bridge); + dev_put(bridge); return ret4 + ret6; } @@ -2270,12 +2237,11 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb, } out: - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); if (!ret && primary_if) *primary_if = hard_iface; - else if (hard_iface) + else batadv_hardif_put(hard_iface); return ret; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index f317d206b411..29276284d281 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -359,15 +359,13 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); genlmsg_end(msg, hdr); return 0; nla_put_failure: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -814,6 +812,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, bat_priv->soft_iface->ifindex)) goto nla_put_failure; + if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, + bat_priv->soft_iface->name)) + goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, @@ -1045,6 +1047,10 @@ static int batadv_netlink_vlan_fill(struct sk_buff *msg, bat_priv->soft_iface->ifindex)) goto nla_put_failure; + if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, + bat_priv->soft_iface->name)) + goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 4bb76b434d07..9f06132e007d 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -217,6 +217,9 @@ static void batadv_nc_node_release(struct kref *ref) */ static void batadv_nc_node_put(struct batadv_nc_node *nc_node) { + if (!nc_node) + return; + kref_put(&nc_node->refcount, batadv_nc_node_release); } @@ -241,6 +244,9 @@ static void batadv_nc_path_release(struct kref *ref) */ static void batadv_nc_path_put(struct batadv_nc_path *nc_path) { + if (!nc_path) + return; + kref_put(&nc_path->refcount, batadv_nc_path_release); } @@ -930,10 +936,8 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, out_nc_node->last_seen = jiffies; out: - if (in_nc_node) - batadv_nc_node_put(in_nc_node); - if (out_nc_node) - batadv_nc_node_put(out_nc_node); + batadv_nc_node_put(in_nc_node); + batadv_nc_node_put(out_nc_node); } /** @@ -1209,14 +1213,10 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, batadv_send_unicast_skb(skb_dest, first_dest); res = true; out: - if (router_neigh) - batadv_neigh_node_put(router_neigh); - if (router_coding) - batadv_neigh_node_put(router_coding); - if (router_neigh_ifinfo) - batadv_neigh_ifinfo_put(router_neigh_ifinfo); - if (router_coding_ifinfo) - batadv_neigh_ifinfo_put(router_coding_ifinfo); + batadv_neigh_node_put(router_neigh); + batadv_neigh_node_put(router_coding); + batadv_neigh_ifinfo_put(router_neigh_ifinfo); + batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index da7249448474..aadc653ca1d8 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -177,7 +177,7 @@ out: * and queue for free after rcu grace period * @ref: kref pointer of the originator-vlan object */ -static void batadv_orig_node_vlan_release(struct kref *ref) +void batadv_orig_node_vlan_release(struct kref *ref) { struct batadv_orig_node_vlan *orig_vlan; @@ -187,16 +187,6 @@ static void batadv_orig_node_vlan_release(struct kref *ref) } /** - * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release - * the originator-vlan object - * @orig_vlan: the originator-vlan object to release - */ -void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) -{ - kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); -} - -/** * batadv_originator_init() - Initialize all originator structures * @bat_priv: the bat priv with all the soft interface information * @@ -231,7 +221,7 @@ err: * free after rcu grace period * @ref: kref pointer of the neigh_ifinfo */ -static void batadv_neigh_ifinfo_release(struct kref *ref) +void batadv_neigh_ifinfo_release(struct kref *ref) { struct batadv_neigh_ifinfo *neigh_ifinfo; @@ -244,21 +234,11 @@ static void batadv_neigh_ifinfo_release(struct kref *ref) } /** - * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release - * the neigh_ifinfo - * @neigh_ifinfo: the neigh_ifinfo object to release - */ -void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) -{ - kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); -} - -/** * batadv_hardif_neigh_release() - release hardif neigh node from lists and * queue for free after rcu grace period * @ref: kref pointer of the neigh_node */ -static void batadv_hardif_neigh_release(struct kref *ref) +void batadv_hardif_neigh_release(struct kref *ref) { struct batadv_hardif_neigh_node *hardif_neigh; @@ -274,21 +254,11 @@ static void batadv_hardif_neigh_release(struct kref *ref) } /** - * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter - * and possibly release it - * @hardif_neigh: hardif neigh neighbor to free - */ -void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) -{ - kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); -} - -/** * batadv_neigh_node_release() - release neigh_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_node */ -static void batadv_neigh_node_release(struct kref *ref) +void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; @@ -309,16 +279,6 @@ static void batadv_neigh_node_release(struct kref *ref) } /** - * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly - * release it - * @neigh_node: neigh neighbor to free - */ -void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) -{ - kref_put(&neigh_node->refcount, batadv_neigh_node_release); -} - -/** * batadv_orig_router_get() - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or @@ -704,8 +664,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node, out: spin_unlock_bh(&orig_node->neigh_list_lock); - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } @@ -797,14 +756,10 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (hardif) - batadv_hardif_put(hardif); - if (hard_iface) - dev_put(hard_iface); - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(hardif); + dev_put(hard_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); return ret; } @@ -814,7 +769,7 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) * free after rcu grace period * @ref: kref pointer of the orig_ifinfo */ -static void batadv_orig_ifinfo_release(struct kref *ref) +void batadv_orig_ifinfo_release(struct kref *ref) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; @@ -826,23 +781,12 @@ static void batadv_orig_ifinfo_release(struct kref *ref) /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); - if (router) - batadv_neigh_node_put(router); + batadv_neigh_node_put(router); kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release - * the orig_ifinfo - * @orig_ifinfo: the orig_ifinfo object to release - */ -void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) -{ - kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); -} - -/** * batadv_orig_node_free_rcu() - free the orig_node * @rcu: rcu pointer of the orig_node */ @@ -865,7 +809,7 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) * free after rcu grace period * @ref: kref pointer of the orig_node */ -static void batadv_orig_node_release(struct kref *ref) +void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; @@ -895,8 +839,7 @@ static void batadv_orig_node_release(struct kref *ref) orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); - if (last_candidate) - batadv_orig_ifinfo_put(last_candidate); + batadv_orig_ifinfo_put(last_candidate); spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { @@ -912,16 +855,6 @@ static void batadv_orig_node_release(struct kref *ref) } /** - * batadv_orig_node_put() - decrement the orig node refcounter and possibly - * release it - * @orig_node: the orig node to free - */ -void batadv_orig_node_put(struct batadv_orig_node *orig_node) -{ - kref_put(&orig_node->refcount, batadv_orig_node_release); -} - -/** * batadv_originator_free() - Free all originator structures * @bat_priv: the bat priv with all the soft interface information */ @@ -1213,8 +1146,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, if (!kref_get_unless_zero(&neigh->refcount)) continue; - if (best) - batadv_neigh_node_put(best); + batadv_neigh_node_put(best); best = neigh; } @@ -1259,8 +1191,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, BATADV_IF_DEFAULT); batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); - if (best_neigh_node) - batadv_neigh_node_put(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); @@ -1279,8 +1210,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, hard_iface); batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); - if (best_neigh_node) - batadv_neigh_node_put(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); batadv_hardif_put(hard_iface); } @@ -1410,14 +1340,10 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (hardif) - batadv_hardif_put(hardif); - if (hard_iface) - dev_put(hard_iface); - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(hardif); + dev_put(hard_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); return ret; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 805be87d55b8..ea3d69e4e670 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -12,6 +12,7 @@ #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -20,19 +21,18 @@ bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); -void batadv_orig_node_put(struct batadv_orig_node *orig_node); +void batadv_orig_node_release(struct kref *ref); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void -batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); +void batadv_hardif_neigh_release(struct kref *ref); struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); +void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); @@ -42,7 +42,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); -void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); +void batadv_neigh_ifinfo_release(struct kref *ref); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); @@ -52,7 +52,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); -void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); +void batadv_orig_ifinfo_release(struct kref *ref); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_node_vlan * @@ -61,7 +61,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); -void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan); +void batadv_orig_node_vlan_release(struct kref *ref); /** * batadv_choose_orig() - Return the index of the orig entry in the hash table @@ -82,4 +82,86 @@ static inline u32 batadv_choose_orig(const void *data, u32 size) struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); +/** + * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release + * the originator-vlan object + * @orig_vlan: the originator-vlan object to release + */ +static inline void +batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) +{ + if (!orig_vlan) + return; + + kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); +} + +/** + * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release + * the neigh_ifinfo + * @neigh_ifinfo: the neigh_ifinfo object to release + */ +static inline void +batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) +{ + if (!neigh_ifinfo) + return; + + kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); +} + +/** + * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter + * and possibly release it + * @hardif_neigh: hardif neigh neighbor to free + */ +static inline void +batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) +{ + if (!hardif_neigh) + return; + + kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); +} + +/** + * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly + * release it + * @neigh_node: neigh neighbor to free + */ +static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) +{ + if (!neigh_node) + return; + + kref_put(&neigh_node->refcount, batadv_neigh_node_release); +} + +/** + * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release + * the orig_ifinfo + * @orig_ifinfo: the orig_ifinfo object to release + */ +static inline void +batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) +{ + if (!orig_ifinfo) + return; + + kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); +} + +/** + * batadv_orig_node_put() - decrement the orig node refcounter and possibly + * release it + * @orig_node: the orig node to free + */ +static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node) +{ + if (!orig_node) + return; + + kref_put(&orig_node->refcount, batadv_orig_node_release); +} + #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 40f5cffde6a3..970d0d7ccc98 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -101,8 +101,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, } /* decrease refcount of previous best neighbor */ - if (curr_router) - batadv_neigh_node_put(curr_router); + batadv_neigh_node_put(curr_router); } /** @@ -128,8 +127,7 @@ void batadv_update_route(struct batadv_priv *bat_priv, _batadv_update_route(bat_priv, orig_node, recv_if, neigh_node); out: - if (router) - batadv_neigh_node_put(router); + batadv_neigh_node_put(router); } /** @@ -269,10 +267,8 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, goto out; } out: - if (primary_if) - batadv_hardif_put(primary_if); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); kfree_skb(skb); @@ -324,10 +320,8 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, skb = NULL; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); kfree_skb(skb); @@ -425,8 +419,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, skb = NULL; put_orig_node: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); free_skb: kfree_skb(skb); @@ -513,8 +506,7 @@ batadv_last_bonding_replace(struct batadv_orig_node *orig_node, orig_node->last_bonding_candidate = new_candidate; spin_unlock_bh(&orig_node->neigh_list_lock); - if (old_candidate) - batadv_orig_ifinfo_put(old_candidate); + batadv_orig_ifinfo_put(old_candidate); } /** @@ -656,8 +648,7 @@ next: batadv_orig_ifinfo_put(next_candidate); } - if (last_candidate) - batadv_orig_ifinfo_put(last_candidate); + batadv_orig_ifinfo_put(last_candidate); return router; } @@ -785,10 +776,8 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); return ret; } @@ -1031,8 +1020,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, orig_node); rx_success: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -1182,9 +1170,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_bcast_packet *bcast_packet; struct ethhdr *ethhdr; int hdr_size = sizeof(*bcast_packet); - int ret = NET_RX_DROP; s32 seq_diff; u32 seqno; + int ret; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) @@ -1210,7 +1198,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, if (batadv_is_my_mac(bat_priv, bcast_packet->orig)) goto free_skb; - if (bcast_packet->ttl < 2) + if (bcast_packet->ttl-- < 2) goto free_skb; orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig); @@ -1249,7 +1237,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet)); /* rebroadcast packet */ - batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false); + ret = batadv_forw_bcast_packet(bat_priv, skb, 0, false); + if (ret == NETDEV_TX_BUSY) + goto free_skb; /* don't hand the broadcast up if it is from an originator * from the same backbone. @@ -1275,8 +1265,8 @@ spin_unlock: spin_unlock_bh(&orig_node->bcast_seqno_lock); free_skb: kfree_skb(skb); + ret = NET_RX_DROP; out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return ret; } diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 157abe92d827..477d85a3b558 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -152,8 +152,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); #endif return ret; @@ -309,8 +308,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); return ret; } @@ -425,8 +423,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, ret = batadv_send_skb_unicast(bat_priv, skb, packet_type, packet_subtype, orig_node, vid); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -452,8 +449,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, BATADV_P_DATA, orig_node, vid); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -474,10 +470,8 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, else consume_skb(forw_packet->skb); - if (forw_packet->if_incoming) - batadv_hardif_put(forw_packet->if_incoming); - if (forw_packet->if_outgoing) - batadv_hardif_put(forw_packet->if_outgoing); + batadv_hardif_put(forw_packet->if_incoming); + batadv_hardif_put(forw_packet->if_outgoing); if (forw_packet->queue_left) atomic_inc(forw_packet->queue_left); kfree(forw_packet); @@ -737,57 +731,52 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, } /** - * batadv_add_bcast_packet_to_list() - queue broadcast packet for multiple sends + * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet + * @if_in: the interface where the packet was received on + * @if_out: the outgoing interface to queue on * - * add a broadcast packet to the queue and setup timers. broadcast packets + * Adds a broadcast packet to the queue and sets up timers. Broadcast packets * are sent multiple times to increase probability for being received. * - * The skb is not consumed, so the caller should make sure that the - * skb is freed. + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the original skb might not be + * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ -int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, - const struct sk_buff *skb, - unsigned long delay, - bool own_packet) +static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet, + struct batadv_hard_iface *if_in, + struct batadv_hard_iface *if_out) { - struct batadv_hard_iface *primary_if; struct batadv_forw_packet *forw_packet; - struct batadv_bcast_packet *bcast_packet; + unsigned long send_time = jiffies; struct sk_buff *newskb; - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) + newskb = skb_clone(skb, GFP_ATOMIC); + if (!newskb) goto err; - newskb = skb_copy(skb, GFP_ATOMIC); - if (!newskb) { - batadv_hardif_put(primary_if); - goto err; - } - - forw_packet = batadv_forw_packet_alloc(primary_if, NULL, + forw_packet = batadv_forw_packet_alloc(if_in, if_out, &bat_priv->bcast_queue_left, bat_priv, newskb); - batadv_hardif_put(primary_if); if (!forw_packet) goto err_packet_free; - /* as we have a copy now, it is safe to decrease the TTL */ - bcast_packet = (struct batadv_bcast_packet *)newskb->data; - bcast_packet->ttl--; - forw_packet->own = own_packet; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); - batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay); + send_time += delay ? delay : msecs_to_jiffies(5); + + batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return NETDEV_TX_OK; err_packet_free: @@ -797,9 +786,222 @@ err: } /** + * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * @if_in: the interface where the packet was received on + * @if_out: the outgoing interface to forward to + * + * Transmits a broadcast packet on the specified interface either immediately + * or if a delay is given after that. Furthermore, queues additional + * retransmissions if this interface is a wireless one. + * + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the original skb might not be + * modifiable anymore. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet, + struct batadv_hard_iface *if_in, + struct batadv_hard_iface *if_out) +{ + unsigned int num_bcasts = if_out->num_bcasts; + struct sk_buff *newskb; + int ret = NETDEV_TX_OK; + + if (!delay) { + newskb = skb_clone(skb, GFP_ATOMIC); + if (!newskb) + return NETDEV_TX_BUSY; + + batadv_send_broadcast_skb(newskb, if_out); + num_bcasts--; + } + + /* delayed broadcast or rebroadcasts? */ + if (num_bcasts >= 1) { + BATADV_SKB_CB(skb)->num_bcasts = num_bcasts; + + ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay, + own_packet, if_in, + if_out); + } + + return ret; +} + +/** + * batadv_send_no_broadcast() - check whether (re)broadcast is necessary + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to check + * @own_packet: true if it is a self-generated broadcast packet + * @if_out: the outgoing interface checked and considered for (re)broadcast + * + * Return: False if a packet needs to be (re)broadcasted on the given interface, + * true otherwise. + */ +static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, + struct sk_buff *skb, bool own_packet, + struct batadv_hard_iface *if_out) +{ + struct batadv_hardif_neigh_node *neigh_node = NULL; + struct batadv_bcast_packet *bcast_packet; + u8 *orig_neigh; + u8 *neigh_addr; + char *type; + int ret; + + if (!own_packet) { + neigh_addr = eth_hdr(skb)->h_source; + neigh_node = batadv_hardif_neigh_get(if_out, + neigh_addr); + } + + bcast_packet = (struct batadv_bcast_packet *)skb->data; + orig_neigh = neigh_node ? neigh_node->orig : NULL; + + ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig, + orig_neigh); + + batadv_hardif_neigh_put(neigh_node); + + /* ok, may broadcast */ + if (!ret) + return false; + + /* no broadcast */ + switch (ret) { + case BATADV_HARDIF_BCAST_NORECIPIENT: + type = "no neighbor"; + break; + case BATADV_HARDIF_BCAST_DUPFWD: + type = "single neighbor is source"; + break; + case BATADV_HARDIF_BCAST_DUPORIG: + type = "single neighbor is originator"; + break; + default: + type = "unknown"; + } + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "BCAST packet from orig %pM on %s suppressed: %s\n", + bcast_packet->orig, + if_out->net_dev->name, type); + + return true; +} + +/** + * __batadv_forw_bcast_packet() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the given skb might not be + * modifiable anymore. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + struct batadv_hard_iface *hard_iface; + struct batadv_hard_iface *primary_if; + int ret = NETDEV_TX_OK; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return NETDEV_TX_BUSY; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + + if (batadv_send_no_broadcast(bat_priv, skb, own_packet, + hard_iface)) { + batadv_hardif_put(hard_iface); + continue; + } + + ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay, + own_packet, primary_if, + hard_iface); + batadv_hardif_put(hard_iface); + + if (ret == NETDEV_TX_BUSY) + break; + } + rcu_read_unlock(); + + batadv_hardif_put(primary_if); + return ret; +} + +/** + * batadv_forw_bcast_packet() - forward and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. + */ +int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); +} + +/** + * batadv_send_bcast_packet() - send and queue a broadcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: broadcast packet to add + * @delay: number of jiffies to wait before sending + * @own_packet: true if it is a self-generated broadcast packet + * + * Transmits a broadcast packet either immediately or if a delay is given + * after that. Furthermore, queues additional retransmissions on wireless + * interfaces. + * + * Consumes the provided skb. + */ +void batadv_send_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet) +{ + __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); + consume_skb(skb); +} + +/** * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary * @forw_packet: the forwarding packet to check - * @hard_iface: the interface to check on * * Checks whether a given packet has any (re)transmissions left on the provided * interface. @@ -811,28 +1013,20 @@ err: * Return: True if (re)transmissions are left, false otherwise. */ static bool -batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet, - struct batadv_hard_iface *hard_iface) +batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet) { - unsigned int max; - - if (hard_iface) - max = hard_iface->num_bcasts; - else - max = BATADV_NUM_BCASTS_MAX; - - return BATADV_SKB_CB(forw_packet->skb)->num_bcasts < max; + return BATADV_SKB_CB(forw_packet->skb)->num_bcasts; } /** - * batadv_forw_packet_bcasts_inc() - increment retransmission counter of a + * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a * packet - * @forw_packet: the packet to increase the counter for + * @forw_packet: the packet to decrease the counter for */ static void -batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) +batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet) { - BATADV_SKB_CB(forw_packet->skb)->num_bcasts++; + BATADV_SKB_CB(forw_packet->skb)->num_bcasts--; } /** @@ -843,30 +1037,30 @@ batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) */ bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet) { - return BATADV_SKB_CB(forw_packet->skb)->num_bcasts > 0; + unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts; + + return num_bcasts != forw_packet->if_outgoing->num_bcasts; } +/** + * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet + * @work: work queue item + * + * Transmits a queued broadcast packet and if necessary reschedules it. + */ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) { - struct batadv_hard_iface *hard_iface; - struct batadv_hardif_neigh_node *neigh_node; - struct delayed_work *delayed_work; + unsigned long send_time = jiffies + msecs_to_jiffies(5); struct batadv_forw_packet *forw_packet; - struct batadv_bcast_packet *bcast_packet; - struct sk_buff *skb1; - struct net_device *soft_iface; + struct delayed_work *delayed_work; struct batadv_priv *bat_priv; - unsigned long send_time = jiffies + msecs_to_jiffies(5); + struct sk_buff *skb1; bool dropped = false; - u8 *neigh_addr; - u8 *orig_neigh; - int ret = 0; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); - soft_iface = forw_packet->if_incoming->soft_iface; - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; @@ -878,76 +1072,15 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) goto out; } - bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data; - - /* rebroadcast packet */ - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) - continue; - - if (!batadv_forw_packet_bcasts_left(forw_packet, hard_iface)) - continue; - - if (forw_packet->own) { - neigh_node = NULL; - } else { - neigh_addr = eth_hdr(forw_packet->skb)->h_source; - neigh_node = batadv_hardif_neigh_get(hard_iface, - neigh_addr); - } - - orig_neigh = neigh_node ? neigh_node->orig : NULL; - - ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig, - orig_neigh); - - if (ret) { - char *type; - - switch (ret) { - case BATADV_HARDIF_BCAST_NORECIPIENT: - type = "no neighbor"; - break; - case BATADV_HARDIF_BCAST_DUPFWD: - type = "single neighbor is source"; - break; - case BATADV_HARDIF_BCAST_DUPORIG: - type = "single neighbor is originator"; - break; - default: - type = "unknown"; - } - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n", - bcast_packet->orig, - hard_iface->net_dev->name, type); - - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); - - continue; - } - - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); - - if (!kref_get_unless_zero(&hard_iface->refcount)) - continue; - - /* send a copy of the saved skb */ - skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); - if (skb1) - batadv_send_broadcast_skb(skb1, hard_iface); - - batadv_hardif_put(hard_iface); - } - rcu_read_unlock(); + /* send a copy of the saved skb */ + skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); + if (!skb1) + goto out; - batadv_forw_packet_bcasts_inc(forw_packet); + batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing); + batadv_forw_packet_bcasts_dec(forw_packet); - /* if we still have some more bcasts to send */ - if (batadv_forw_packet_bcasts_left(forw_packet, NULL)) { + if (batadv_forw_packet_bcasts_left(forw_packet)) { batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 2b0daf8b2bc4..08af251b765c 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -39,10 +39,14 @@ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface); int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh_node); -int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, - const struct sk_buff *skb, - unsigned long delay, - bool own_packet); +int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet); +void batadv_send_bcast_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned long delay, + bool own_packet); void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6b8181bc3122..0604b0279573 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -26,7 +26,6 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> -#include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> @@ -37,6 +36,7 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> +#include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -191,7 +191,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; - unsigned long brd_delay = 1; + unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; @@ -330,7 +330,7 @@ send: bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; - bcast_packet->ttl = BATADV_TTL; + bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; @@ -346,13 +346,7 @@ send: seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); - batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true); - - /* a copy is stored in the bcast list, therefore removing - * the original skb. - */ - consume_skb(skb); - + batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ @@ -389,10 +383,8 @@ dropped: dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: - if (mcast_single_orig) - batadv_orig_node_put(mcast_single_orig); - if (primary_if) - batadv_hardif_put(primary_if); + batadv_orig_node_put(mcast_single_orig); + batadv_hardif_put(primary_if); return NETDEV_TX_OK; } @@ -507,7 +499,7 @@ out: * after rcu grace period * @ref: kref pointer of the vlan object */ -static void batadv_softif_vlan_release(struct kref *ref) +void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; @@ -521,19 +513,6 @@ static void batadv_softif_vlan_release(struct kref *ref) } /** - * batadv_softif_vlan_put() - decrease the vlan object refcounter and - * possibly release it - * @vlan: the vlan object to release - */ -void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) -{ - if (!vlan) - return; - - kref_put(&vlan->refcount, batadv_softif_vlan_release); -} - -/** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve @@ -848,18 +827,16 @@ static int batadv_softif_slave_add(struct net_device *dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; - struct net *net = dev_net(dev); int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; - ret = batadv_hardif_enable_interface(hard_iface, net, dev->name); + ret = batadv_hardif_enable_interface(hard_iface, dev); out: - if (hard_iface) - batadv_hardif_put(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -885,8 +862,7 @@ static int batadv_softif_slave_del(struct net_device *dev, ret = 0; out: - if (hard_iface) - batadv_hardif_put(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -1093,38 +1069,6 @@ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, } /** - * batadv_softif_create() - Create and register soft interface - * @net: the applicable net namespace - * @name: name of the new soft interface - * - * Return: newly allocated soft_interface, NULL on errors - */ -struct net_device *batadv_softif_create(struct net *net, const char *name) -{ - struct net_device *soft_iface; - int ret; - - soft_iface = alloc_netdev(sizeof(struct batadv_priv), name, - NET_NAME_UNKNOWN, batadv_softif_init_early); - if (!soft_iface) - return NULL; - - dev_net_set(soft_iface, net); - - soft_iface->rtnl_link_ops = &batadv_link_ops; - - ret = register_netdevice(soft_iface); - if (ret < 0) { - pr_err("Unable to register the batman interface '%s': %i\n", - name, ret); - free_netdev(soft_iface); - return NULL; - } - - return soft_iface; -} - -/** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 38b0ad182584..9f2003f1a497 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -9,22 +9,34 @@ #include "main.h" +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> -#include <net/net_namespace.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); -struct net_device *batadv_softif_create(struct net *net, const char *name); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); +void batadv_softif_vlan_release(struct kref *ref); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); +/** + * batadv_softif_vlan_put() - decrease the vlan object refcounter and + * possibly release it + * @vlan: the vlan object to release + */ +static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) +{ + if (!vlan) + return; + + kref_put(&vlan->refcount, batadv_softif_vlan_release); +} + #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 789c851732b7..56b9fe97b3b4 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -358,6 +358,9 @@ static void batadv_tp_vars_release(struct kref *ref) */ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) { + if (!tp_vars) + return; + kref_put(&tp_vars->refcount, batadv_tp_vars_release); } @@ -748,12 +751,9 @@ move_twnd: wake_up(&tp_vars->more_bytes); out: - if (likely(primary_if)) - batadv_hardif_put(primary_if); - if (likely(orig_node)) - batadv_orig_node_put(orig_node); - if (likely(tp_vars)) - batadv_tp_vars_put(tp_vars); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); + batadv_tp_vars_put(tp_vars); } /** @@ -882,10 +882,8 @@ static int batadv_tp_send(void *arg) } out: - if (likely(primary_if)) - batadv_hardif_put(primary_if); - if (likely(orig_node)) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); batadv_tp_sender_cleanup(bat_priv, tp_vars); @@ -1205,10 +1203,8 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, ret = 0; out: - if (likely(orig_node)) - batadv_orig_node_put(orig_node); - if (likely(primary_if)) - batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); return ret; } @@ -1456,8 +1452,7 @@ send_ack: batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv, icmp->timestamp, icmp->session, icmp->uid); out: - if (likely(tp_vars)) - batadv_tp_vars_put(tp_vars); + batadv_tp_vars_put(tp_vars); } /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 434b4f042909..e0b3dace2020 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -247,6 +247,9 @@ static void batadv_tt_local_entry_release(struct kref *ref) static void batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) { + if (!tt_local_entry) + return; + kref_put(&tt_local_entry->common.refcount, batadv_tt_local_entry_release); } @@ -270,7 +273,7 @@ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) * queue for free after rcu grace period * @ref: kref pointer of the nc_node */ -static void batadv_tt_global_entry_release(struct kref *ref) +void batadv_tt_global_entry_release(struct kref *ref) { struct batadv_tt_global_entry *tt_global_entry; @@ -283,17 +286,6 @@ static void batadv_tt_global_entry_release(struct kref *ref) } /** - * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and - * possibly release it - * @tt_global_entry: tt_global_entry to be free'd - */ -void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) -{ - kref_put(&tt_global_entry->common.refcount, - batadv_tt_global_entry_release); -} - -/** * batadv_tt_global_hash_count() - count the number of orig entries * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for @@ -452,6 +444,9 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) static void batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { + if (!orig_entry) + return; + kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } @@ -818,14 +813,10 @@ check_roaming: ret = true; out: - if (in_hardif) - batadv_hardif_put(in_hardif); - if (in_dev) - dev_put(in_dev); - if (tt_local) - batadv_tt_local_entry_put(tt_local); - if (tt_global) - batadv_tt_global_entry_put(tt_global); + batadv_hardif_put(in_hardif); + dev_put(in_dev); + batadv_tt_local_entry_put(tt_local); + batadv_tt_global_entry_put(tt_global); return ret; } @@ -1215,10 +1206,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; @@ -1305,8 +1294,7 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, batadv_tt_local_entry_put(tt_removed_entry); out: - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } @@ -1576,8 +1564,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, sync_flags: batadv_tt_global_sync_flags(tt_global); out: - if (orig_entry) - batadv_tt_orig_list_entry_put(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); spin_unlock_bh(&tt_global->list_lock); } @@ -1750,10 +1737,8 @@ out_remove: tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM; out: - if (tt_global_entry) - batadv_tt_global_entry_put(tt_global_entry); - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_global_entry_put(tt_global_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -1789,15 +1774,13 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, } /* release the refcount for the "old" best */ - if (best_router) - batadv_neigh_node_put(best_router); + batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } - if (best_router) - batadv_neigh_node_put(best_router); + batadv_neigh_node_put(best_router); return best_entry; } @@ -2003,10 +1986,8 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; @@ -2196,10 +2177,8 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, } out: - if (tt_global_entry) - batadv_tt_global_entry_put(tt_global_entry); - if (local_entry) - batadv_tt_local_entry_put(local_entry); + batadv_tt_global_entry_put(tt_global_entry); + batadv_tt_local_entry_put(local_entry); } /** @@ -2426,10 +2405,8 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, rcu_read_unlock(); out: - if (tt_global_entry) - batadv_tt_global_entry_put(tt_global_entry); - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_global_entry_put(tt_global_entry); + batadv_tt_local_entry_put(tt_local_entry); return orig_node; } @@ -2606,6 +2583,9 @@ static void batadv_tt_req_node_release(struct kref *ref) */ static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) { + if (!tt_req_node) + return; + kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); } @@ -2987,8 +2967,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); @@ -2999,8 +2978,7 @@ out: spin_unlock_bh(&bat_priv->tt.req_list_lock); } - if (tt_req_node) - batadv_tt_req_node_put(tt_req_node); + batadv_tt_req_node_put(tt_req_node); kfree(tvlv_tt_data); return ret; @@ -3131,10 +3109,8 @@ unlock: spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); out: - if (res_dst_orig_node) - batadv_orig_node_put(res_dst_orig_node); - if (req_dst_orig_node) - batadv_orig_node_put(req_dst_orig_node); + batadv_orig_node_put(res_dst_orig_node); + batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } @@ -3248,10 +3224,8 @@ unlock: spin_unlock_bh(&bat_priv->tt.last_changeset_lock); out: spin_unlock_bh(&bat_priv->tt.commit_lock); - if (orig_node) - batadv_orig_node_put(orig_node); - if (primary_if) - batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; @@ -3336,8 +3310,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, atomic_set(&orig_node->last_ttvn, ttvn); out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, @@ -3378,8 +3351,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, goto out; ret = true; out: - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3442,8 +3414,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->tt.req_list_lock); out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) @@ -3574,8 +3545,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, &tvlv_roam, sizeof(tvlv_roam)); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) @@ -4170,8 +4140,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, atomic_read(&orig_node->last_ttvn) + 1); out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index e1285904f885..d18740d9a22b 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> @@ -28,7 +29,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); -void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry); +void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, @@ -55,4 +56,19 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); +/** + * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and + * possibly release it + * @tt_global_entry: tt_global_entry to be free'd + */ +static inline void +batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) +{ + if (!tt_global_entry) + return; + + kref_put(&tt_global_entry->common.refcount, + batadv_tt_global_entry_release); +} + #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 253f5a33a914..992773376e51 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -50,6 +50,9 @@ static void batadv_tvlv_handler_release(struct kref *ref) */ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { + if (!tvlv_handler) + return; + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } @@ -106,6 +109,9 @@ static void batadv_tvlv_container_release(struct kref *ref) */ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { + if (!tvlv) + return; + kref_put(&tvlv->refcount, batadv_tvlv_container_release); } @@ -438,8 +444,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, ogm_source, orig_node, src, dst, tvlv_value, tvlv_value_cont_len); - if (tvlv_handler) - batadv_tvlv_handler_put(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 97617d02c8f9..fd164a248569 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -103,34 +103,6 @@ static inline bool peer_del(struct lowpan_btle_dev *dev, return false; } -static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_btle_dev *dev, - bdaddr_t *ba, __u8 type) -{ - struct lowpan_peer *peer; - - BT_DBG("peers %d addr %pMR type %d", atomic_read(&dev->peer_count), - ba, type); - - rcu_read_lock(); - - list_for_each_entry_rcu(peer, &dev->peers, list) { - BT_DBG("dst addr %pMR dst type %d", - &peer->chan->dst, peer->chan->dst_type); - - if (bacmp(&peer->chan->dst, ba)) - continue; - - if (type == peer->chan->dst_type) { - rcu_read_unlock(); - return peer; - } - } - - rcu_read_unlock(); - - return NULL; -} - static inline struct lowpan_peer * __peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { @@ -195,7 +167,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, rcu_read_lock(); list_for_each_entry_rcu(peer, &dev->peers, list) { - BT_DBG("dst addr %pMR dst type %d ip %pI6c", + BT_DBG("dst addr %pMR dst type %u ip %pI6c", &peer->chan->dst, peer->chan->dst_type, &peer->peer_addr); @@ -506,7 +478,7 @@ static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) local_skb = skb_clone(skb, GFP_ATOMIC); - BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); @@ -549,7 +521,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) if (err) { if (lowpan_cb(skb)->chan) { - BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); @@ -691,7 +663,7 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; - int err = 0; + int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, @@ -818,7 +790,7 @@ static void chan_close_cb(struct l2cap_chan *chan) BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); - BT_DBG("chan %p orig refcnt %d", chan, + BT_DBG("chan %p orig refcnt %u", chan, kref_read(&chan->kref)); l2cap_chan_put(chan); @@ -907,14 +879,6 @@ static const struct l2cap_ops bt_6lowpan_chan_ops = { .set_shutdown = l2cap_chan_no_set_shutdown, }; -static inline __u8 bdaddr_type(__u8 type) -{ - if (type == ADDR_LE_DEV_PUBLIC) - return BDADDR_LE_PUBLIC; - else - return BDADDR_LE_RANDOM; -} - static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; @@ -940,7 +904,7 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) { struct lowpan_peer *peer; - BT_DBG("conn %p dst type %d", conn, dst_type); + BT_DBG("conn %p dst type %u", conn, dst_type); peer = lookup_peer(conn); if (!peer) @@ -972,7 +936,7 @@ static struct l2cap_chan *bt_6lowpan_listen(void) atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); - BT_DBG("chan %p src type %d", chan, chan->src_type); + BT_DBG("chan %p src type %u", chan, chan->src_type); err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { @@ -1013,7 +977,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, *conn = (struct l2cap_conn *)hcon->l2cap_data; - BT_DBG("conn %p dst %pMR type %d", *conn, &hcon->dst, hcon->dst_type); + BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type); return 0; } @@ -1155,7 +1119,7 @@ static ssize_t lowpan_control_write(struct file *fp, return -EALREADY; } - BT_DBG("conn %p dst %pMR type %d user %d", conn, + BT_DBG("conn %p dst %pMR type %d user %u", conn, &conn->hcon->dst, conn->hcon->dst_type, addr_type); } diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 463bad58478b..1fcc482397c3 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -120,7 +120,7 @@ static int a2mp_command_rej(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*rej)) return -EINVAL; - BT_DBG("ident %d reason %d", hdr->ident, le16_to_cpu(rej->reason)); + BT_DBG("ident %u reason %d", hdr->ident, le16_to_cpu(rej->reason)); skb_pull(skb, sizeof(*rej)); @@ -219,7 +219,7 @@ static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb, cl = (void *) skb->data; while (len >= sizeof(*cl)) { - BT_DBG("Remote AMP id %d type %d status %d", cl->id, cl->type, + BT_DBG("Remote AMP id %u type %u status %u", cl->id, cl->type, cl->status); if (cl->id != AMP_ID_BREDR && cl->type != AMP_TYPE_BREDR) { @@ -273,7 +273,7 @@ static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cl *cl = (void *) skb->data; while (skb->len >= sizeof(*cl)) { - BT_DBG("Controller id %d type %d status %d", cl->id, cl->type, + BT_DBG("Controller id %u type %u status %u", cl->id, cl->type, cl->status); cl = skb_pull(skb, sizeof(*cl)); } @@ -302,7 +302,7 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("id %d", req->id); + BT_DBG("id %u", req->id); hdev = hci_dev_get(req->id); if (!hdev || hdev->dev_type != HCI_AMP) { @@ -344,7 +344,7 @@ static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*rsp)) return -EINVAL; - BT_DBG("id %d status 0x%2.2x", rsp->id, rsp->status); + BT_DBG("id %u status 0x%2.2x", rsp->id, rsp->status); if (rsp->status) return -EINVAL; @@ -373,7 +373,7 @@ static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("id %d", req->id); + BT_DBG("id %u", req->id); /* Make sure that other request is not processed */ tmp = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC); @@ -423,7 +423,7 @@ static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb, assoc_len = len - sizeof(*rsp); - BT_DBG("id %d status 0x%2.2x assoc len %zu", rsp->id, rsp->status, + BT_DBG("id %u status 0x%2.2x assoc len %zu", rsp->id, rsp->status, assoc_len); if (rsp->status) @@ -457,7 +457,7 @@ static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb, if (!hcon) goto done; - BT_DBG("Created hcon %p: loc:%d -> rem:%d", hcon, hdev->id, rsp->id); + BT_DBG("Created hcon %p: loc:%u -> rem:%u", hcon, hdev->id, rsp->id); mgr->bredr_chan->remote_amp_id = rsp->id; @@ -481,7 +481,7 @@ static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("local_id %d, remote_id %d", req->local_id, req->remote_id); + BT_DBG("local_id %u, remote_id %u", req->local_id, req->remote_id); memset(&rsp, 0, sizeof(rsp)); @@ -562,7 +562,7 @@ static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb, if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; - BT_DBG("local_id %d remote_id %d", req->local_id, req->remote_id); + BT_DBG("local_id %u remote_id %u", req->local_id, req->remote_id); memset(&rsp, 0, sizeof(rsp)); @@ -599,7 +599,7 @@ send_rsp: static inline int a2mp_cmd_rsp(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cmd *hdr) { - BT_DBG("ident %d code 0x%2.2x", hdr->ident, hdr->code); + BT_DBG("ident %u code 0x%2.2x", hdr->ident, hdr->code); skb_pull(skb, le16_to_cpu(hdr->len)); return 0; @@ -620,7 +620,7 @@ static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) hdr = (void *) skb->data; len = le16_to_cpu(hdr->len); - BT_DBG("code 0x%2.2x id %d len %u", hdr->code, hdr->ident, len); + BT_DBG("code 0x%2.2x id %u len %u", hdr->code, hdr->ident, len); skb_pull(skb, sizeof(*hdr)); diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index be2d469d6369..2134f92bd7ac 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -78,7 +78,7 @@ struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id) { struct amp_ctrl *ctrl; - BT_DBG("mgr %p id %d", mgr, id); + BT_DBG("mgr %p id %u", mgr, id); mutex_lock(&mgr->amp_ctrls_lock); list_for_each_entry(ctrl, &mgr->amp_ctrls, list) { @@ -179,7 +179,7 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) /* Legacy key */ if (conn->key_type < 3) { - bt_dev_err(hdev, "legacy key type %d", conn->key_type); + bt_dev_err(hdev, "legacy key type %u", conn->key_type); return -EACCES; } @@ -257,7 +257,7 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) struct hci_request req; int err; - BT_DBG("%s handle %d", hdev->name, phy_handle); + BT_DBG("%s handle %u", hdev->name, phy_handle); cp.phy_handle = phy_handle; cp.max_len = cpu_to_le16(hdev->amp_assoc_size); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 43c284158f63..72f47b372705 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -126,8 +126,8 @@ static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len f[i].start = get_unaligned_be16(data++); f[i].end = get_unaligned_be16(data++); - BT_DBG("proto filter start %d end %d", - f[i].start, f[i].end); + BT_DBG("proto filter start %u end %u", + f[i].start, f[i].end); } if (i < BNEP_MAX_PROTO_FILTERS) @@ -266,7 +266,7 @@ static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb) break; } - BT_DBG("type 0x%x len %d", h->type, h->len); + BT_DBG("type 0x%x len %u", h->type, h->len); switch (h->type & BNEP_TYPE_MASK) { case BNEP_EXT_CONTROL: @@ -424,7 +424,7 @@ static int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb) int len = 0, il = 0; u8 type = 0; - BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type); + BT_DBG("skb %p dev %p type %u", skb, skb->dev, skb->pkt_type); if (!skb->dev) { /* Control frame sent by us */ diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index eb41556002e3..f3bedc3b613a 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -74,7 +74,7 @@ static struct cmtp_application *cmtp_application_add(struct cmtp_session *sessio { struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL); - BT_DBG("session %p application %p appl %d", session, app, appl); + BT_DBG("session %p application %p appl %u", session, app, appl); if (!app) return NULL; @@ -135,7 +135,7 @@ static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb) { struct cmtp_scb *scb = (void *) skb->cb; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); scb->id = -1; scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3); @@ -152,7 +152,7 @@ static void cmtp_send_interopmsg(struct cmtp_session *session, struct sk_buff *skb; unsigned char *s; - BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum); + BT_DBG("session %p subcmd 0x%02x appl %u msgnum %u", session, subcmd, appl, msgnum); skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC); if (!skb) { @@ -188,7 +188,7 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s __u16 appl, msgnum, func, info; __u32 controller; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); switch (CAPIMSG_SUBCOMMAND(skb->data)) { case CAPI_CONF: @@ -321,7 +321,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) __u16 appl; __u32 contr; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); if (skb->len < CAPI_MSG_BASELEN) return; @@ -344,7 +344,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) appl = application->appl; CAPIMSG_SETAPPID(skb->data, appl); } else { - BT_ERR("Can't find application with id %d", appl); + BT_ERR("Can't find application with id %u", appl); kfree_skb(skb); return; } @@ -385,8 +385,8 @@ static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_ unsigned char buf[8]; int err = 0, nconn, want = rp->level3cnt; - BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d", - ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen); + BT_DBG("ctrl %p appl %u level3cnt %u datablkcnt %u datablklen %u", + ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen); application = cmtp_application_add(session, appl); if (!application) { @@ -450,7 +450,7 @@ static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl) struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *application; - BT_DBG("ctrl %p appl %d", ctrl, appl); + BT_DBG("ctrl %p appl %u", ctrl, appl); application = cmtp_application_get(session, CMTP_APPLID, appl); if (!application) { @@ -483,7 +483,7 @@ static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) application = cmtp_application_get(session, CMTP_APPLID, appl); if ((!application) || (application->state != BT_CONNECTED)) { - BT_ERR("Can't find application with id %d", appl); + BT_ERR("Can't find application with id %u", appl); return CAPI_ILLAPPNR; } @@ -515,7 +515,7 @@ static int cmtp_proc_show(struct seq_file *m, void *v) seq_printf(m, "ctrl %d\n", session->num); list_for_each_entry(app, &session->applications, list) { - seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); + seq_printf(m, "appl %u -> %u\n", app->appl, app->mapping); } return 0; diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h index c32638dddbf9..f6b9dc4e408f 100644 --- a/net/bluetooth/cmtp/cmtp.h +++ b/net/bluetooth/cmtp/cmtp.h @@ -26,7 +26,7 @@ #include <linux/types.h> #include <net/bluetooth/bluetooth.h> -#define BTNAMSIZ 18 +#define BTNAMSIZ 21 /* CMTP ioctl defines */ #define CMTPCONNADD _IOW('C', 200, int) diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 07cfa3249f83..0a2d78e811cf 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -392,6 +392,11 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) if (!(session->flags & BIT(CMTP_LOOPBACK))) { err = cmtp_attach_device(session); if (err < 0) { + /* Caller will call fput in case of failure, and so + * will cmtp_session kthread. + */ + get_file(session->sock->file); + atomic_inc(&session->terminate); wake_up_interruptible(sk_sleep(session->sock->sk)); up_write(&cmtp_session_sem); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 88ec08978ff4..2b5059a56cda 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -257,7 +257,7 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); - /* When we are master of an established connection and it enters + /* When we are central of an established connection and it enters * the disconnect timeout, then go ahead and try to read the * current clock offset. Processing of the result is done * within the event handling and hci_clock_offset_evt function. @@ -758,7 +758,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) conn->state = BT_CLOSED; /* If the status indicates successful cancellation of - * the attempt (i.e. Unkown Connection Id) there's no point of + * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to * connect. The only exception is explicit connect requests * where a timeout + cancel does indicate an actual failure. @@ -1109,9 +1109,9 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, hci_req_init(&req, hdev); - /* Disable advertising if we're active. For master role + /* Disable advertising if we're active. For central role * connections most controllers will refuse to connect if - * advertising is enabled, and for slave role connections we + * advertising is enabled, and for peripheral role connections we * anyway have to disable it in order to start directed * advertising. Any registered advertisements will be * re-enabled after the connection attempt is finished. @@ -1119,7 +1119,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_pause_adv_instances(&req); - /* If requested to connect as slave use directed advertising */ + /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { /* If we're active scanning most controllers are unable * to initiate advertising. Simply reject the attempt. @@ -1842,7 +1842,7 @@ u32 hci_conn_get_phy(struct hci_conn *conn) /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471: * Table 6.2: Packets defined for synchronous, asynchronous, and - * CSB logical transport types. + * CPB logical transport types. */ switch (conn->type) { case SCO_LINK: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 7d71d104fdfd..8a47a3017d61 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -545,24 +545,24 @@ static void hci_set_event_mask_page_2(struct hci_request *req) u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; bool changed = false; - /* If Connectionless Slave Broadcast master role is supported + /* If Connectionless Peripheral Broadcast central role is supported * enable all necessary events for it. */ - if (lmp_csb_master_capable(hdev)) { + if (lmp_cpb_central_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ - events[2] |= 0x10; /* Slave Page Response Timeout */ - events[2] |= 0x20; /* CSB Channel Map Change */ + events[2] |= 0x10; /* Peripheral Page Response Timeout */ + events[2] |= 0x20; /* CPB Channel Map Change */ changed = true; } - /* If Connectionless Slave Broadcast slave role is supported + /* If Connectionless Peripheral Broadcast peripheral role is supported * enable all necessary events for it. */ - if (lmp_csb_slave_capable(hdev)) { + if (lmp_cpb_peripheral_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ - events[2] |= 0x02; /* CSB Receive */ - events[2] |= 0x04; /* CSB Timeout */ + events[2] |= 0x02; /* CPB Receive */ + events[2] |= 0x04; /* CPB Timeout */ events[2] |= 0x08; /* Truncated Page Complete */ changed = true; } @@ -648,7 +648,7 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) */ /* If the controller supports Extended Scanner Filter - * Policies, enable the correspondig event. + * Policies, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY) events[1] |= 0x04; /* LE Direct Advertising @@ -749,14 +749,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) } if (hdev->commands[26] & 0x40) { - /* Read LE White List Size */ - hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, + /* Read LE Accept List Size */ + hci_req_add(req, HCI_OP_LE_READ_ACCEPT_LIST_SIZE, 0, NULL); } if (hdev->commands[26] & 0x80) { - /* Clear LE White List */ - hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); + /* Clear LE Accept List */ + hci_req_add(req, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL); } if (hdev->commands[34] & 0x40) { @@ -1343,6 +1343,12 @@ int hci_inquiry(void __user *arg) goto done; } + /* Restrict maximum inquiry length to 60 seconds */ + if (ir.length > 60) { + err = -EINVAL; + goto done; + } + hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { @@ -1454,7 +1460,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) } /* Check for valid public address or a configured static - * random adddress, but let the HCI setup proceed to + * random address, but let the HCI setup proceed to * be able to determine if there is a public address * or not. * @@ -1718,26 +1724,28 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) int hci_dev_do_close(struct hci_dev *hdev) { bool auto_off; + int err = 0; BT_DBG("%s %p", hdev->name, hdev); + cancel_delayed_work(&hdev->power_off); + cancel_delayed_work(&hdev->ncmd_timer); + + hci_request_cancel_all(hdev); + hci_req_sync_lock(hdev); + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) - hdev->shutdown(hdev); + err = hdev->shutdown(hdev); } - cancel_delayed_work(&hdev->power_off); - - hci_request_cancel_all(hdev); - hci_req_sync_lock(hdev); - if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); hci_req_sync_unlock(hdev); - return 0; + return err; } hci_leds_update_powered(hdev, false); @@ -1844,7 +1852,7 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_req_sync_unlock(hdev); hci_dev_put(hdev); - return 0; + return err; } int hci_dev_close(__u16 dev) @@ -2777,6 +2785,24 @@ static void hci_cmd_timeout(struct work_struct *work) queue_work(hdev->workqueue, &hdev->cmd_work); } +/* HCI ncmd timer function */ +static void hci_ncmd_timeout(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + ncmd_timer.work); + + bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); + + /* During HCI_INIT phase no events can be injected if the ncmd timer + * triggers since the procedure has its own timeout handling. + */ + if (test_bit(HCI_INIT, &hdev->flags)) + return; + + /* This is an irrecoverable state, inject hardware error event */ + hci_reset_dev(hdev); +} + struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { @@ -3549,7 +3575,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; - /* If trying to estabilish one time connection to disabled + /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { @@ -3694,13 +3720,13 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, /* Suspend consists of two actions: * - First, disconnect everything and make the controller not * connectable (disabling scanning) - * - Second, program event filter/whitelist and enable scan + * - Second, program event filter/accept list and enable scan */ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); if (!ret) state = BT_SUSPEND_DISCONNECT; - /* Only configure whitelist if disconnect succeeded and wake + /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. */ if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) { @@ -3732,11 +3758,18 @@ done: } /* Alloc HCI device */ -struct hci_dev *hci_alloc_dev(void) +struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; + unsigned int alloc_size; - hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); + alloc_size = sizeof(*hdev); + if (sizeof_priv) { + /* Fixme: May need ALIGN-ment? */ + alloc_size += sizeof_priv; + } + + hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; @@ -3808,14 +3841,14 @@ struct hci_dev *hci_alloc_dev(void) mutex_init(&hdev->req_lock); INIT_LIST_HEAD(&hdev->mgmt_pending); - INIT_LIST_HEAD(&hdev->blacklist); - INIT_LIST_HEAD(&hdev->whitelist); + INIT_LIST_HEAD(&hdev->reject_list); + INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); - INIT_LIST_HEAD(&hdev->le_white_list); + INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); @@ -3841,6 +3874,7 @@ struct hci_dev *hci_alloc_dev(void) init_waitqueue_head(&hdev->suspend_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); + INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_request_setup(hdev); @@ -3849,7 +3883,7 @@ struct hci_dev *hci_alloc_dev(void) return hdev; } -EXPORT_SYMBOL(hci_alloc_dev); +EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) @@ -3976,14 +4010,10 @@ EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { - int id; - BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); hci_dev_set_flag(hdev, HCI_UNREGISTER); - id = hdev->id; - write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); @@ -4018,7 +4048,14 @@ void hci_unregister_dev(struct hci_dev *hdev) } device_del(&hdev->dev); + /* Actual cleanup is deferred until hci_release_dev(). */ + hci_dev_put(hdev); +} +EXPORT_SYMBOL(hci_unregister_dev); +/* Release HCI device */ +void hci_release_dev(struct hci_dev *hdev) +{ debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); kfree_const(hdev->fw_info); @@ -4027,8 +4064,8 @@ void hci_unregister_dev(struct hci_dev *hdev) destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); - hci_bdaddr_list_clear(&hdev->blacklist); - hci_bdaddr_list_clear(&hdev->whitelist); + hci_bdaddr_list_clear(&hdev->reject_list); + hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); @@ -4036,18 +4073,17 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); - hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_dev_unlock(hdev); - hci_dev_put(hdev); - - ida_simple_remove(&hci_index_ida, id); + ida_simple_remove(&hci_index_ida, hdev->id); + kfree(hdev); } -EXPORT_SYMBOL(hci_unregister_dev); +EXPORT_SYMBOL(hci_release_dev); /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) @@ -4078,6 +4114,8 @@ int hci_reset_dev(struct hci_dev *hdev) hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); + bt_dev_err(hdev, "Injecting HCI hardware error event"); + /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } @@ -4284,7 +4322,7 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } -/* Send HCI command and wait for command commplete event */ +/* Send HCI command and wait for command complete event */ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 47f4f21fbc1a..841393389f7b 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -125,7 +125,7 @@ static int device_list_show(struct seq_file *f, void *ptr) struct bdaddr_list *b; hci_dev_lock(hdev); - list_for_each_entry(b, &hdev->whitelist, list) + list_for_each_entry(b, &hdev->accept_list, list) seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); list_for_each_entry(p, &hdev->le_conn_params, list) { seq_printf(f, "%pMR (type %u) %u\n", &p->addr, p->addr_type, @@ -144,7 +144,7 @@ static int blacklist_show(struct seq_file *f, void *p) struct bdaddr_list *b; hci_dev_lock(hdev); - list_for_each_entry(b, &hdev->blacklist, list) + list_for_each_entry(b, &hdev->reject_list, list) seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); hci_dev_unlock(hdev); @@ -784,7 +784,7 @@ static int white_list_show(struct seq_file *f, void *ptr) struct bdaddr_list *b; hci_dev_lock(hdev); - list_for_each_entry(b, &hdev->le_white_list, list) + list_for_each_entry(b, &hdev->le_accept_list, list) seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); hci_dev_unlock(hdev); @@ -1195,7 +1195,7 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &force_static_address_fops); debugfs_create_u8("white_list_size", 0444, hdev->debugfs, - &hdev->le_white_list_size); + &hdev->le_accept_list_size); debugfs_create_file("white_list", 0444, hdev->debugfs, hdev, &white_list_fops); debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 016b2999f219..0bca035bf2dc 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -40,6 +40,8 @@ #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" +#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000) + /* Handle HCI Event packets */ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb, @@ -236,7 +238,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) hdev->ssp_debug_mode = 0; - hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); } @@ -1171,6 +1173,12 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&hdev->random_addr, sent); + if (!bacmp(&hdev->rpa, sent)) { + hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); + queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, + secs_to_jiffies(hdev->rpa_timeout)); + } + hci_dev_unlock(hdev); } @@ -1201,24 +1209,30 @@ static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, { __u8 status = *((__u8 *) skb->data); struct hci_cp_le_set_adv_set_rand_addr *cp; - struct adv_info *adv_instance; + struct adv_info *adv; if (status) return; cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR); - if (!cp) + /* Update only in case the adv instance since handle 0x00 shall be using + * HCI_OP_LE_SET_RANDOM_ADDR since that allows both extended and + * non-extended adverting. + */ + if (!cp || !cp->handle) return; hci_dev_lock(hdev); - if (!cp->handle) { - /* Store in hdev for instance 0 (Set adv and Directed advs) */ - bacpy(&hdev->random_addr, &cp->bdaddr); - } else { - adv_instance = hci_find_adv_instance(hdev, cp->handle); - if (adv_instance) - bacpy(&adv_instance->random_addr, &cp->bdaddr); + adv = hci_find_adv_instance(hdev, cp->handle); + if (adv) { + bacpy(&adv->random_addr, &cp->bdaddr); + if (!bacmp(&hdev->rpa, &cp->bdaddr)) { + adv->rpa_expired = false; + queue_delayed_work(hdev->workqueue, + &adv->rpa_expired_cb, + secs_to_jiffies(hdev->rpa_timeout)); + } } hci_dev_unlock(hdev); @@ -1277,7 +1291,9 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *set; __u8 status = *((__u8 *) skb->data); + struct adv_info *adv = NULL, *n; BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1288,22 +1304,48 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, if (!cp) return; + set = (void *)cp->data; + hci_dev_lock(hdev); + if (cp->num_of_sets) + adv = hci_find_adv_instance(hdev, set->handle); + if (cp->enable) { struct hci_conn *conn; hci_dev_set_flag(hdev, HCI_LE_ADV); + if (adv) + adv->enabled = true; + conn = hci_lookup_le_connect(hdev); if (conn) queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, conn->conn_timeout); } else { + if (adv) { + adv->enabled = false; + /* If just one instance was disabled check if there are + * any other instance enabled before clearing HCI_LE_ADV + */ + list_for_each_entry_safe(adv, n, &hdev->adv_instances, + list) { + if (adv->enabled) + goto unlock; + } + } else { + /* All instances shall be considered disabled */ + list_for_each_entry_safe(adv, n, &hdev->adv_instances, + list) + adv->enabled = false; + } + hci_dev_clear_flag(hdev, HCI_LE_ADV); } +unlock: hci_dev_unlock(hdev); } @@ -1492,21 +1534,21 @@ static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev, hdev->le_num_of_adv_sets = rp->num_of_sets; } -static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_read_accept_list_size(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_rp_le_read_white_list_size *rp = (void *) skb->data; + struct hci_rp_le_read_accept_list_size *rp = (void *)skb->data; BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size); if (rp->status) return; - hdev->le_white_list_size = rp->size; + hdev->le_accept_list_size = rp->size; } -static void hci_cc_le_clear_white_list(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_clear_accept_list(struct hci_dev *hdev, + struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); @@ -1515,13 +1557,13 @@ static void hci_cc_le_clear_white_list(struct hci_dev *hdev, if (status) return; - hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_accept_list); } -static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_add_to_accept_list(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_cp_le_add_to_white_list *sent; + struct hci_cp_le_add_to_accept_list *sent; __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1529,18 +1571,18 @@ static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, if (status) return; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_WHITE_LIST); + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST); if (!sent) return; - hci_bdaddr_list_add(&hdev->le_white_list, &sent->bdaddr, - sent->bdaddr_type); + hci_bdaddr_list_add(&hdev->le_accept_list, &sent->bdaddr, + sent->bdaddr_type); } -static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, - struct sk_buff *skb) +static void hci_cc_le_del_from_accept_list(struct hci_dev *hdev, + struct sk_buff *skb) { - struct hci_cp_le_del_from_white_list *sent; + struct hci_cp_le_del_from_accept_list *sent; __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1548,11 +1590,11 @@ static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, if (status) return; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_WHITE_LIST); + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST); if (!sent) return; - hci_bdaddr_list_del(&hdev->le_white_list, &sent->bdaddr, + hci_bdaddr_list_del(&hdev->le_accept_list, &sent->bdaddr, sent->bdaddr_type); } @@ -2069,7 +2111,7 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, if (conn && (conn->state == BT_CONFIG || conn->state == BT_CONNECTED) && !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, name, name_len); + mgmt_device_connected(hdev, conn, name, name_len); if (discov->state == DISCOVERY_STOPPED) return; @@ -2306,19 +2348,20 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); if (conn) { - u8 type = conn->type; - mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); + if (conn->type == LE_LINK) { + hdev->cur_adv_instance = conn->adv_instance; + hci_req_reenable_advertising(hdev); + } + /* If the disconnection failed for any reason, the upper layer * does not retry to disconnect in current implementation. * Hence, we need to do some basic cleanup here and re-enable * advertising if necessary. */ hci_conn_del(conn); - if (type == LE_LINK) - hci_req_reenable_advertising(hdev); } hci_dev_unlock(hdev); @@ -2367,7 +2410,7 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, /* We don't want the connection attempt to stick around * indefinitely since LE doesn't have a page timeout concept * like BR/EDR. Set a timer for any connection that doesn't use - * the white list for connecting. + * the accept list for connecting. */ if (filter_policy == HCI_LE_USE_PEER_ADDR) queue_delayed_work(conn->hdev->workqueue, @@ -2623,7 +2666,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * only used during suspend. */ if (ev->link_type == ACL_LINK && - hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr, BDADDR_BREDR)) { conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr, @@ -2745,19 +2788,19 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (hci_bdaddr_list_lookup(&hdev->blacklist, &ev->bdaddr, + if (hci_bdaddr_list_lookup(&hdev->reject_list, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); return; } - /* Require HCI_CONNECTABLE or a whitelist entry to accept the + /* Require HCI_CONNECTABLE or an accept list entry to accept the * connection. These features are only touched through mgmt so * only do the checks if HCI_MGMT is set. */ if (hci_dev_test_flag(hdev, HCI_MGMT) && !hci_dev_test_flag(hdev, HCI_CONNECTABLE) && - !hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, &ev->bdaddr, + !hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); return; @@ -2795,9 +2838,9 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&cp.bdaddr, &ev->bdaddr); if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER)) - cp.role = 0x00; /* Become master */ + cp.role = 0x00; /* Become central */ else - cp.role = 0x01; /* Remain slave */ + cp.role = 0x01; /* Remain peripheral */ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); } else if (!(flags & HCI_PROTO_DEFER)) { @@ -2844,7 +2887,6 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) struct hci_conn_params *params; struct hci_conn *conn; bool mgmt_connected; - u8 type; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -2899,10 +2941,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } } - type = conn->type; - hci_disconn_cfm(conn, ev->reason); - hci_conn_del(conn); /* The suspend notifier is waiting for all devices to disconnect so * clear the bit from pending tasks and inform the wait queue. @@ -2922,8 +2961,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * or until a connection is created or until the Advertising * is timed out due to Directed Advertising." */ - if (type == LE_LINK) + if (conn->type == LE_LINK) { + hdev->cur_adv_instance = conn->adv_instance; hci_req_reenable_advertising(hdev); + } + + hci_conn_del(conn); unlock: hci_dev_unlock(hdev); @@ -3256,7 +3299,7 @@ static void hci_remote_features_evt(struct hci_dev *hdev, cp.pscan_rep_mode = 0x02; hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; @@ -3268,6 +3311,21 @@ unlock: hci_dev_unlock(hdev); } +static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd) +{ + cancel_delayed_work(&hdev->cmd_timer); + + if (!test_bit(HCI_RESET, &hdev->flags)) { + if (ncmd) { + cancel_delayed_work(&hdev->ncmd_timer); + atomic_set(&hdev->cmd_cnt, 1); + } else { + schedule_delayed_work(&hdev->ncmd_timer, + HCI_NCMD_TIMEOUT); + } + } +} + static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, u16 *opcode, u8 *status, hci_req_complete_t *req_complete, @@ -3521,20 +3579,20 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_scan_enable(hdev, skb); break; - case HCI_OP_LE_READ_WHITE_LIST_SIZE: - hci_cc_le_read_white_list_size(hdev, skb); + case HCI_OP_LE_READ_ACCEPT_LIST_SIZE: + hci_cc_le_read_accept_list_size(hdev, skb); break; - case HCI_OP_LE_CLEAR_WHITE_LIST: - hci_cc_le_clear_white_list(hdev, skb); + case HCI_OP_LE_CLEAR_ACCEPT_LIST: + hci_cc_le_clear_accept_list(hdev, skb); break; - case HCI_OP_LE_ADD_TO_WHITE_LIST: - hci_cc_le_add_to_white_list(hdev, skb); + case HCI_OP_LE_ADD_TO_ACCEPT_LIST: + hci_cc_le_add_to_accept_list(hdev, skb); break; - case HCI_OP_LE_DEL_FROM_WHITE_LIST: - hci_cc_le_del_from_white_list(hdev, skb); + case HCI_OP_LE_DEL_FROM_ACCEPT_LIST: + hci_cc_le_del_from_accept_list(hdev, skb); break; case HCI_OP_LE_READ_SUPPORTED_STATES: @@ -3630,11 +3688,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, break; } - if (*opcode != HCI_OP_NOP) - cancel_delayed_work(&hdev->cmd_timer); - - if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) - atomic_set(&hdev->cmd_cnt, 1); + handle_cmd_cnt_and_timer(hdev, ev->ncmd); hci_req_cmd_complete(hdev, *opcode, *status, req_complete, req_complete_skb); @@ -3735,11 +3789,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, break; } - if (*opcode != HCI_OP_NOP) - cancel_delayed_work(&hdev->cmd_timer); - - if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) - atomic_set(&hdev->cmd_cnt, 1); + handle_cmd_cnt_and_timer(hdev, ev->ncmd); /* Indicate request completion if the command failed. Also, if * we're not waiting for a special event and we get a success @@ -4330,7 +4380,7 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, cp.pscan_rep_mode = 0x02; hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; @@ -4373,6 +4423,21 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, switch (ev->status) { case 0x00: + /* The synchronous connection complete event should only be + * sent once per new connection. Receiving a successful + * complete event when the connection status is already + * BT_CONNECTED means that the device is misbehaving and sent + * multiple complete event packets for the same new connection. + * + * Registering the device more than once can corrupt kernel + * memory, hence upon detecting this invalid event, we report + * an error and ignore the packet. + */ + if (conn->state == BT_CONNECTED) { + bt_dev_err(hdev, "Ignoring connect complete event for existing connection"); + goto unlock; + } + conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; conn->type = ev->link_type; @@ -4404,12 +4469,12 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode); - switch (conn->setting & SCO_AIRMODE_MASK) { - case SCO_AIRMODE_CVSD: + switch (ev->air_mode) { + case 0x02: if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); break; - case SCO_AIRMODE_TRANSP: + case 0x03: if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); break; @@ -5095,9 +5160,64 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, } #endif +static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr, + u8 bdaddr_type, bdaddr_t *local_rpa) +{ + if (conn->out) { + conn->dst_type = bdaddr_type; + conn->resp_addr_type = bdaddr_type; + bacpy(&conn->resp_addr, bdaddr); + + /* Check if the controller has set a Local RPA then it must be + * used instead or hdev->rpa. + */ + if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, local_rpa); + } else if (hci_dev_test_flag(conn->hdev, HCI_PRIVACY)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, &conn->hdev->rpa); + } else { + hci_copy_identity_address(conn->hdev, &conn->init_addr, + &conn->init_addr_type); + } + } else { + conn->resp_addr_type = conn->hdev->adv_addr_type; + /* Check if the controller has set a Local RPA then it must be + * used instead or hdev->rpa. + */ + if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) { + conn->resp_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->resp_addr, local_rpa); + } else if (conn->hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { + /* In case of ext adv, resp_addr will be updated in + * Adv Terminated event. + */ + if (!ext_adv_capable(conn->hdev)) + bacpy(&conn->resp_addr, + &conn->hdev->random_addr); + } else { + bacpy(&conn->resp_addr, &conn->hdev->bdaddr); + } + + conn->init_addr_type = bdaddr_type; + bacpy(&conn->init_addr, bdaddr); + + /* For incoming connections, set the default minimum + * and maximum connection interval. They will be used + * to check if the parameters are in range and if not + * trigger the connection update procedure. + */ + conn->le_conn_min_interval = conn->hdev->le_conn_min_interval; + conn->le_conn_max_interval = conn->hdev->le_conn_max_interval; + } +} + static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, - bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle, - u16 interval, u16 latency, u16 supervision_timeout) + bdaddr_t *bdaddr, u8 bdaddr_type, + bdaddr_t *local_rpa, u8 role, u16 handle, + u16 interval, u16 latency, + u16 supervision_timeout) { struct hci_conn_params *params; struct hci_conn *conn; @@ -5122,8 +5242,8 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = bdaddr_type; /* If we didn't have a hci_conn object previously - * but we're in master role this must be something - * initiated using a white list. Since white list based + * but we're in central role this must be something + * initiated using an accept list. Since accept list based * connections are not "first class citizens" we don't * have full tracking of them. Therefore, we go ahead * with a "best effort" approach of determining the @@ -5145,32 +5265,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, cancel_delayed_work(&conn->le_conn_timeout); } - if (!conn->out) { - /* Set the responder (our side) address type based on - * the advertising address type. - */ - conn->resp_addr_type = hdev->adv_addr_type; - if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { - /* In case of ext adv, resp_addr will be updated in - * Adv Terminated event. - */ - if (!ext_adv_capable(hdev)) - bacpy(&conn->resp_addr, &hdev->random_addr); - } else { - bacpy(&conn->resp_addr, &hdev->bdaddr); - } - - conn->init_addr_type = bdaddr_type; - bacpy(&conn->init_addr, bdaddr); - - /* For incoming connections, set the default minimum - * and maximum connection interval. They will be used - * to check if the parameters are in range and if not - * trigger the connection update procedure. - */ - conn->le_conn_min_interval = hdev->le_conn_min_interval; - conn->le_conn_max_interval = hdev->le_conn_max_interval; - } + le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa); /* Lookup the identity address from the stored connection * address and address type. @@ -5187,6 +5282,23 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = irk->addr_type; } + /* When using controller based address resolution, then the new + * address types 0x02 and 0x03 are used. These types need to be + * converted back into either public address or random address type + */ + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && + hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { + switch (conn->dst_type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + conn->dst_type = ADDR_LE_DEV_PUBLIC; + break; + case ADDR_LE_DEV_RANDOM_RESOLVED: + conn->dst_type = ADDR_LE_DEV_RANDOM; + break; + } + } + if (status) { hci_le_conn_failed(conn, status); goto unlock; @@ -5198,18 +5310,25 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, addr_type = BDADDR_LE_RANDOM; /* Drop the connection if the device is blocked */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, &conn->dst, addr_type)) { + if (hci_bdaddr_list_lookup(&hdev->reject_list, &conn->dst, addr_type)) { hci_conn_drop(conn); goto unlock; } if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) - mgmt_device_connected(hdev, conn, 0, NULL, 0); + mgmt_device_connected(hdev, conn, NULL, 0); conn->sec_level = BT_SECURITY_LOW; conn->handle = handle; conn->state = BT_CONFIG; + /* Store current advertising instance as connection advertising instance + * when sotfware rotation is in use so it can be re-enabled when + * disconnected. + */ + if (!ext_adv_capable(hdev)) + conn->adv_instance = hdev->cur_adv_instance; + conn->le_conn_interval = interval; conn->le_conn_latency = latency; conn->le_supv_timeout = supervision_timeout; @@ -5217,17 +5336,17 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - /* The remote features procedure is defined for master + /* The remote features procedure is defined for central * role only. So only in case of an initiated connection * request the remote features. * - * If the local controller supports slave-initiated features - * exchange, then requesting the remote features in slave + * If the local controller supports peripheral-initiated features + * exchange, then requesting the remote features in peripheral * role is possible. Otherwise just transition into the * connected state without requesting the remote features. */ if (conn->out || - (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) { + (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) { struct hci_cp_le_read_remote_features cp; cp.handle = __cpu_to_le16(conn->handle); @@ -5264,7 +5383,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, - ev->role, le16_to_cpu(ev->handle), + NULL, ev->role, le16_to_cpu(ev->handle), le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); @@ -5278,7 +5397,7 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, - ev->role, le16_to_cpu(ev->handle), + &ev->local_rpa, ev->role, le16_to_cpu(ev->handle), le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); @@ -5293,17 +5412,35 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data; struct hci_conn *conn; + struct adv_info *adv; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); - if (ev->status) + adv = hci_find_adv_instance(hdev, ev->handle); + + if (ev->status) { + if (!adv) + return; + + /* Remove advertising as it has been terminated */ + hci_remove_adv_instance(hdev, ev->handle); + mgmt_advertising_removed(NULL, hdev, ev->handle); + return; + } + + if (adv) + adv->enabled = false; conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle)); if (conn) { - struct adv_info *adv_instance; + /* Store handle in the connection so the correct advertising + * instance can be re-enabled when disconnected. + */ + conn->adv_instance = ev->handle; - if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM) + if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM || + bacmp(&conn->resp_addr, BDADDR_ANY)) return; if (!ev->handle) { @@ -5311,9 +5448,8 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - adv_instance = hci_find_adv_instance(hdev, ev->handle); - if (adv_instance) - bacpy(&conn->resp_addr, &adv_instance->random_addr); + if (adv) + bacpy(&conn->resp_addr, &adv->random_addr); } } @@ -5354,13 +5490,13 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, return NULL; /* Ignore if the device is blocked */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, addr, addr_type)) + if (hci_bdaddr_list_lookup(&hdev->reject_list, addr, addr_type)) return NULL; /* Most controller will fail if we try to create new connections - * while we have an existing one in slave role. + * while we have an existing one in peripheral role. */ - if (hdev->conn_hash.le_num_slave > 0 && + if (hdev->conn_hash.le_num_peripheral > 0 && (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || !(hdev->le_states[3] & 0x10))) return NULL; @@ -5378,7 +5514,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_DIRECT: /* Only devices advertising with ADV_DIRECT_IND are * triggering a connection attempt. This is allowing - * incoming connections from slave devices. + * incoming connections from peripheral devices. */ if (adv_type != LE_ADV_DIRECT_IND) return NULL; @@ -5386,8 +5522,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_ALWAYS: /* Devices advertising with ADV_IND or ADV_DIRECT_IND * are triggering a connection attempt. This means - * that incoming connections from slave device are - * accepted and also outgoing connections to slave + * that incoming connections from peripheral device are + * accepted and also outgoing connections to peripheral * devices are established when found. */ break; @@ -5441,7 +5577,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, struct hci_conn *conn; bool match; u32 flags; - u8 *ptr, real_len; + u8 *ptr; switch (type) { case LE_ADV_IND: @@ -5472,14 +5608,10 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, break; } - real_len = ptr - data; - - /* Adjust for actual length */ - if (len != real_len) { - bt_dev_err_ratelimited(hdev, "advertising data len corrected %u -> %u", - len, real_len); - len = real_len; - } + /* Adjust for actual length. This handles the case when remote + * device is advertising with incorrect data length. + */ + len = ptr - data; /* If the direct address is present, then this report is from * a LE Direct Advertising Report event. In that case it is @@ -5752,7 +5884,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, if (conn->state == BT_CONFIG) { __u8 status; - /* If the local controller supports slave-initiated + /* If the local controller supports peripheral-initiated * features exchange, but the remote controller does * not, then it is possible that the error code 0x1a * for unsupported remote feature gets returned. @@ -5761,8 +5893,8 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, * transition into connected state and mark it as * successful. */ - if ((hdev->le_features[0] & HCI_LE_SLAVE_FEATURES) && - !conn->out && ev->status == 0x1a) + if (!conn->out && ev->status == 0x1a && + (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) status = 0x00; else status = ev->status; @@ -6032,7 +6164,7 @@ static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return true; } - /* Check if request ended in Command Status - no way to retreive + /* Check if request ended in Command Status - no way to retrieve * any extra parameters in this case. */ if (hdr->evt == HCI_EV_CMD_STATUS) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index fa9125b782f8..f15626607b2d 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -745,17 +745,17 @@ void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn) } } -static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, - u8 bdaddr_type) +static void del_from_accept_list(struct hci_request *req, bdaddr_t *bdaddr, + u8 bdaddr_type) { - struct hci_cp_le_del_from_white_list cp; + struct hci_cp_le_del_from_accept_list cp; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); - bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from whitelist", &cp.bdaddr, + bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from accept list", &cp.bdaddr, cp.bdaddr_type); - hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_DEL_FROM_ACCEPT_LIST, sizeof(cp), &cp); if (use_ll_privacy(req->hdev) && hci_dev_test_flag(req->hdev, HCI_ENABLE_LL_PRIVACY)) { @@ -774,31 +774,31 @@ static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, } } -/* Adds connection to white list if needed. On error, returns -1. */ -static int add_to_white_list(struct hci_request *req, - struct hci_conn_params *params, u8 *num_entries, - bool allow_rpa) +/* Adds connection to accept list if needed. On error, returns -1. */ +static int add_to_accept_list(struct hci_request *req, + struct hci_conn_params *params, u8 *num_entries, + bool allow_rpa) { - struct hci_cp_le_add_to_white_list cp; + struct hci_cp_le_add_to_accept_list cp; struct hci_dev *hdev = req->hdev; - /* Already in white list */ - if (hci_bdaddr_list_lookup(&hdev->le_white_list, ¶ms->addr, + /* Already in accept list */ + if (hci_bdaddr_list_lookup(&hdev->le_accept_list, ¶ms->addr, params->addr_type)) return 0; /* Select filter policy to accept all advertising */ - if (*num_entries >= hdev->le_white_list_size) + if (*num_entries >= hdev->le_accept_list_size) return -1; - /* White list can not be used with RPAs */ + /* Accept list can not be used with RPAs */ if (!allow_rpa && !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) { return -1; } - /* During suspend, only wakeable devices can be in whitelist */ + /* During suspend, only wakeable devices can be in accept list */ if (hdev->suspended && !hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, params->current_flags)) return 0; @@ -807,9 +807,9 @@ static int add_to_white_list(struct hci_request *req, cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, ¶ms->addr); - bt_dev_dbg(hdev, "Add %pMR (0x%x) to whitelist", &cp.bdaddr, + bt_dev_dbg(hdev, "Add %pMR (0x%x) to accept list", &cp.bdaddr, cp.bdaddr_type); - hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_ADD_TO_ACCEPT_LIST, sizeof(cp), &cp); if (use_ll_privacy(hdev) && hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) { @@ -837,15 +837,15 @@ static int add_to_white_list(struct hci_request *req, return 0; } -static u8 update_white_list(struct hci_request *req) +static u8 update_accept_list(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct hci_conn_params *params; struct bdaddr_list *b; u8 num_entries = 0; bool pend_conn, pend_report; - /* We allow whitelisting even with RPAs in suspend. In the worst case, - * we won't be able to wake from devices that use the privacy1.2 + /* We allow usage of accept list even with RPAs in suspend. In the worst + * case, we won't be able to wake from devices that use the privacy1.2 * features. Additionally, once we support privacy1.2 and IRK * offloading, we can update this to also check for those conditions. */ @@ -855,13 +855,13 @@ static u8 update_white_list(struct hci_request *req) hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) allow_rpa = true; - /* Go through the current white list programmed into the + /* Go through the current accept list programmed into the * controller one by one and check if that address is still * in the list of pending connections or list of devices to * report. If not present in either list, then queue the * command to remove it from the controller. */ - list_for_each_entry(b, &hdev->le_white_list, list) { + list_for_each_entry(b, &hdev->le_accept_list, list) { pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); @@ -870,14 +870,14 @@ static u8 update_white_list(struct hci_request *req) b->bdaddr_type); /* If the device is not likely to connect or report, - * remove it from the whitelist. + * remove it from the accept list. */ if (!pend_conn && !pend_report) { - del_from_white_list(req, &b->bdaddr, b->bdaddr_type); + del_from_accept_list(req, &b->bdaddr, b->bdaddr_type); continue; } - /* White list can not be used with RPAs */ + /* Accept list can not be used with RPAs */ if (!allow_rpa && !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { @@ -887,27 +887,27 @@ static u8 update_white_list(struct hci_request *req) num_entries++; } - /* Since all no longer valid white list entries have been + /* Since all no longer valid accept list entries have been * removed, walk through the list of pending connections * and ensure that any new device gets programmed into * the controller. * * If the list of the devices is larger than the list of - * available white list entries in the controller, then + * available accept list entries in the controller, then * just abort and return filer policy value to not use the - * white list. + * accept list. */ list_for_each_entry(params, &hdev->pend_le_conns, action) { - if (add_to_white_list(req, params, &num_entries, allow_rpa)) + if (add_to_accept_list(req, params, &num_entries, allow_rpa)) return 0x00; } /* After adding all new pending connections, walk through * the list of pending reports and also add these to the - * white list if there is still space. Abort if space runs out. + * accept list if there is still space. Abort if space runs out. */ list_for_each_entry(params, &hdev->pend_le_reports, action) { - if (add_to_white_list(req, params, &num_entries, allow_rpa)) + if (add_to_accept_list(req, params, &num_entries, allow_rpa)) return 0x00; } @@ -921,7 +921,7 @@ static u8 update_white_list(struct hci_request *req) hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST) return 0x00; - /* Select filter policy to use white list */ + /* Select filter policy to use accept list */ return 0x01; } @@ -932,7 +932,7 @@ static bool scan_use_rpa(struct hci_dev *hdev) static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy, - bool addr_resolv) + bool filter_dup, bool addr_resolv) { struct hci_dev *hdev = req->hdev; @@ -997,7 +997,7 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); ext_enable_cp.enable = LE_SCAN_ENABLE; - ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + ext_enable_cp.filter_dup = filter_dup; hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(ext_enable_cp), &ext_enable_cp); @@ -1016,7 +1016,7 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, memset(&enable_cp, 0, sizeof(enable_cp)); enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + enable_cp.filter_dup = filter_dup; hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), &enable_cp); } @@ -1053,6 +1053,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req) u8 own_addr_type; u8 filter_policy; u16 window, interval; + /* Default is to enable duplicates filter */ + u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; /* Background scanning should run with address resolution */ bool addr_resolv = true; @@ -1076,20 +1078,20 @@ void hci_req_add_le_passive_scan(struct hci_request *req) return; bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state); - /* Adding or removing entries from the white list must + /* Adding or removing entries from the accept list must * happen before enabling scanning. The controller does - * not allow white list modification while scanning. + * not allow accept list modification while scanning. */ - filter_policy = update_white_list(req); + filter_policy = update_accept_list(req); /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support * for handling directed advertising. * - * So instead of using filter polices 0x00 (no whitelist) - * and 0x01 (whitelist enabled) use the new filter policies - * 0x02 (no whitelist) and 0x03 (whitelist enabled). + * So instead of using filter polices 0x00 (no accept list) + * and 0x01 (accept list enabled) use the new filter policies + * 0x02 (no accept list) and 0x03 (accept list enabled). */ if (hci_dev_test_flag(hdev, HCI_PRIVACY) && (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) @@ -1106,14 +1108,30 @@ void hci_req_add_le_passive_scan(struct hci_request *req) } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; + + /* Disable duplicates filter when scanning for advertisement + * monitor for the following reasons. + * + * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm + * controllers ignore RSSI_Sampling_Period when the duplicates + * filter is enabled. + * + * For SW pattern filtering, when we're not doing interleaved + * scanning, it is necessary to disable duplicates filter, + * otherwise hosts can only receive one advertisement and it's + * impossible to know if a peer is still in range. + */ + filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; } - bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy); + bt_dev_dbg(hdev, "LE passive scan with accept list = %d", + filter_policy); hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window, - own_addr_type, filter_policy, addr_resolv); + own_addr_type, filter_policy, filter_dup, + addr_resolv); } static bool adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) @@ -1163,7 +1181,7 @@ static void hci_req_set_event_filter(struct hci_request *req) /* Always clear event filter when starting */ hci_req_clear_event_filter(req); - list_for_each_entry(b, &hdev->whitelist, list) { + list_for_each_entry(b, &hdev->accept_list, list) { if (!hci_conn_test_flag(HCI_CONN_FLAG_REMOTE_WAKEUP, b->current_flags)) continue; @@ -1502,13 +1520,14 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) if (hci_conn_num(hdev, LE_LINK) == 0) return true; - /* Check le_states if there is any connection in slave role. */ - if (hdev->conn_hash.le_num_slave > 0) { - /* Slave connection state and non connectable mode bit 20. */ + /* Check le_states if there is any connection in peripheral role. */ + if (hdev->conn_hash.le_num_peripheral > 0) { + /* Peripheral connection state and non connectable mode bit 20. + */ if (!connectable && !(hdev->le_states[2] & 0x10)) return false; - /* Slave connection state and connectable mode bit 38 + /* Peripheral connection state and connectable mode bit 38 * and scannable bit 21. */ if (connectable && (!(hdev->le_states[4] & 0x40) || @@ -1516,13 +1535,13 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) return false; } - /* Check le_states if there is any connection in master role. */ - if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_slave) { - /* Master connection state and non connectable mode bit 18. */ + /* Check le_states if there is any connection in central role. */ + if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) { + /* Central connection state and non connectable mode bit 18. */ if (!connectable && !(hdev->le_states[2] & 0x02)) return false; - /* Master connection state and connectable mode bit 35 and + /* Central connection state and connectable mode bit 35 and * scannable 19. */ if (connectable && (!(hdev->le_states[4] & 0x08) || @@ -1697,30 +1716,33 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) return; if (ext_adv_capable(hdev)) { - struct hci_cp_le_set_ext_scan_rsp_data cp; + struct { + struct hci_cp_le_set_ext_scan_rsp_data cp; + u8 data[HCI_MAX_EXT_AD_LENGTH]; + } pdu; - memset(&cp, 0, sizeof(cp)); + memset(&pdu, 0, sizeof(pdu)); if (instance) len = create_instance_scan_rsp_data(hdev, instance, - cp.data); + pdu.data); else - len = create_default_scan_rsp_data(hdev, cp.data); + len = create_default_scan_rsp_data(hdev, pdu.data); if (hdev->scan_rsp_data_len == len && - !memcmp(cp.data, hdev->scan_rsp_data, len)) + !memcmp(pdu.data, hdev->scan_rsp_data, len)) return; - memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + memcpy(hdev->scan_rsp_data, pdu.data, len); hdev->scan_rsp_data_len = len; - cp.handle = instance; - cp.length = len; - cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + pdu.cp.handle = instance; + pdu.cp.length = len; + pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp), - &cp); + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, + sizeof(pdu.cp) + len, &pdu.cp); } else { struct hci_cp_le_set_scan_rsp_data cp; @@ -1843,26 +1865,30 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) return; if (ext_adv_capable(hdev)) { - struct hci_cp_le_set_ext_adv_data cp; + struct { + struct hci_cp_le_set_ext_adv_data cp; + u8 data[HCI_MAX_EXT_AD_LENGTH]; + } pdu; - memset(&cp, 0, sizeof(cp)); + memset(&pdu, 0, sizeof(pdu)); - len = create_instance_adv_data(hdev, instance, cp.data); + len = create_instance_adv_data(hdev, instance, pdu.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) + memcmp(pdu.data, hdev->adv_data, len) == 0) return; - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + memcpy(hdev->adv_data, pdu.data, len); hdev->adv_data_len = len; - cp.length = len; - cp.handle = instance; - cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + pdu.cp.length = len; + pdu.cp.handle = instance; + pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, + sizeof(pdu.cp) + len, &pdu.cp); } else { struct hci_cp_le_set_adv_data cp; @@ -2046,8 +2072,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, * current RPA has expired then generate a new one. */ if (use_rpa) { - int to; - /* If Controller supports LL Privacy use own address type is * 0x03 */ @@ -2058,14 +2082,10 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, *own_addr_type = ADDR_LE_DEV_RANDOM; if (adv_instance) { - if (!adv_instance->rpa_expired && - !bacmp(&adv_instance->random_addr, &hdev->rpa)) + if (adv_rpa_valid(adv_instance)) return 0; - - adv_instance->rpa_expired = false; } else { - if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && - !bacmp(&hdev->random_addr, &hdev->rpa)) + if (rpa_valid(hdev)) return 0; } @@ -2077,14 +2097,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bacpy(rand_addr, &hdev->rpa); - to = msecs_to_jiffies(hdev->rpa_timeout * 1000); - if (adv_instance) - queue_delayed_work(hdev->workqueue, - &adv_instance->rpa_expired_cb, to); - else - queue_delayed_work(hdev->workqueue, - &hdev->rpa_expired, to); - return 0; } @@ -2127,6 +2139,30 @@ void __hci_req_clear_ext_adv_sets(struct hci_request *req) hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); } +static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) +{ + struct hci_dev *hdev = req->hdev; + + /* If we're advertising or initiating an LE connection we can't + * go ahead and change the random address at this time. This is + * because the eventual initiator address used for the + * subsequently created connection will be undefined (some + * controllers use the new address and others the one we had + * when the operation started). + * + * In this kind of scenario skip the update and let the random + * address be updated at the next cycle. + */ + if (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev)) { + bt_dev_dbg(hdev, "Deferring random address update"); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + return; + } + + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa); +} + int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; @@ -2229,6 +2265,13 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) } else { if (!bacmp(&random_addr, &hdev->random_addr)) return 0; + /* Instance 0x00 doesn't have an adv_info, instead it + * uses hdev->random_addr to track its address so + * whenever it needs to be updated this also set the + * random address since hdev->random_addr is shared with + * scan state machine. + */ + set_random_addr(req, &random_addr); } memset(&cp, 0, sizeof(cp)); @@ -2486,30 +2529,6 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, false); } -static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) -{ - struct hci_dev *hdev = req->hdev; - - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the - * subsequently created connection will be undefined (some - * controllers use the new address and others the one we had - * when the operation started). - * - * In this kind of scenario skip the update and let the random - * address be updated at the next cycle. - */ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { - bt_dev_dbg(hdev, "Deferring random address update"); - hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); - return; - } - - hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa); -} - int hci_update_random_address(struct hci_request *req, bool require_privacy, bool use_rpa, u8 *own_addr_type) { @@ -2521,8 +2540,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, * the current RPA in use, then generate a new one. */ if (use_rpa) { - int to; - /* If Controller supports LL Privacy use own address type is * 0x03 */ @@ -2532,8 +2549,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, else *own_addr_type = ADDR_LE_DEV_RANDOM; - if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && - !bacmp(&hdev->random_addr, &hdev->rpa)) + if (rpa_valid(hdev)) return 0; err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); @@ -2544,9 +2560,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, set_random_addr(req, &hdev->rpa); - to = msecs_to_jiffies(hdev->rpa_timeout * 1000); - queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, to); - return 0; } @@ -2605,11 +2618,11 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, return 0; } -static bool disconnected_whitelist_entries(struct hci_dev *hdev) +static bool disconnected_accept_list_entries(struct hci_dev *hdev) { struct bdaddr_list *b; - list_for_each_entry(b, &hdev->whitelist, list) { + list_for_each_entry(b, &hdev->accept_list, list) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); @@ -2641,7 +2654,7 @@ void __hci_req_update_scan(struct hci_request *req) return; if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || - disconnected_whitelist_entries(hdev)) + disconnected_accept_list_entries(hdev)) scan = SCAN_PAGE; else scan = SCAN_DISABLED; @@ -3133,8 +3146,10 @@ static int active_scan(struct hci_request *req, unsigned long opt) uint16_t interval = opt; struct hci_dev *hdev = req->hdev; u8 own_addr_type; - /* White list is not used for discovery */ + /* Accept list is not used for discovery */ u8 filter_policy = 0x00; + /* Default is to enable duplicates filter */ + u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; /* Discovery doesn't require controller address resolution */ bool addr_resolv = false; int err; @@ -3159,9 +3174,26 @@ static int active_scan(struct hci_request *req, unsigned long opt) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; + if (hci_is_adv_monitoring(hdev)) { + /* Duplicate filter should be disabled when some advertisement + * monitor is activated, otherwise AdvMon can only receive one + * advertisement for one peer(*) during active scanning, and + * might report loss to these peers. + * + * Note that different controllers have different meanings of + * |duplicate|. Some of them consider packets with the same + * address as duplicate, and others consider packets with the + * same address and the same RSSI as duplicate. Although in the + * latter case we don't need to disable duplicate filter, but + * it is common to have active scanning for a short period of + * time, the power impact should be neglectable. + */ + filter_dup = LE_SCAN_FILTER_DUP_DISABLE; + } + hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, hdev->le_scan_window_discovery, own_addr_type, - filter_policy, addr_resolv); + filter_policy, filter_dup, addr_resolv); return 0; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index eed0dd066e12..f1128c2134f0 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -59,6 +59,17 @@ struct hci_pinfo { char comm[TASK_COMM_LEN]; }; +static struct hci_dev *hci_hdev_from_sock(struct sock *sk) +{ + struct hci_dev *hdev = hci_pi(sk)->hdev; + + if (!hdev) + return ERR_PTR(-EBADFD); + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return ERR_PTR(-EPIPE); + return hdev; +} + void hci_sock_set_flag(struct sock *sk, int nr) { set_bit(nr, &hci_pi(sk)->flags); @@ -759,19 +770,13 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) if (event == HCI_DEV_UNREG) { struct sock *sk; - /* Detach sockets from device */ + /* Wake up sockets using this dead device */ read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { - lock_sock(sk); if (hci_pi(sk)->hdev == hdev) { - hci_pi(sk)->hdev = NULL; sk->sk_err = EPIPE; - sk->sk_state = BT_OPEN; sk->sk_state_change(sk); - - hci_dev_put(hdev); } - release_sock(sk); } read_unlock(&hci_sk_list.lock); } @@ -892,7 +897,7 @@ static int hci_sock_release(struct socket *sock) return 0; } -static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) +static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; @@ -902,14 +907,14 @@ static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) hci_dev_lock(hdev); - err = hci_bdaddr_list_add(&hdev->blacklist, &bdaddr, BDADDR_BREDR); + err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); return err; } -static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) +static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; int err; @@ -919,7 +924,7 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) hci_dev_lock(hdev); - err = hci_bdaddr_list_del(&hdev->blacklist, &bdaddr, BDADDR_BREDR); + err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); @@ -930,10 +935,10 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { - struct hci_dev *hdev = hci_pi(sk)->hdev; + struct hci_dev *hdev = hci_hdev_from_sock(sk); - if (!hdev) - return -EBADFD; + if (IS_ERR(hdev)) + return PTR_ERR(hdev); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; @@ -959,12 +964,12 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return hci_sock_blacklist_add(hdev, (void __user *)arg); + return hci_sock_reject_list_add(hdev, (void __user *)arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return hci_sock_blacklist_del(hdev, (void __user *)arg); + return hci_sock_reject_list_del(hdev, (void __user *)arg); } return -ENOIOCTLCMD; @@ -1103,6 +1108,18 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, lock_sock(sk); + /* Allow detaching from dead device and attaching to alive device, if + * the caller wants to re-bind (instead of close) this socket in + * response to hci_sock_dev_event(HCI_DEV_UNREG) notification. + */ + hdev = hci_pi(sk)->hdev; + if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) { + hci_pi(sk)->hdev = NULL; + sk->sk_state = BT_OPEN; + hci_dev_put(hdev); + } + hdev = NULL; + if (sk->sk_state == BT_BOUND) { err = -EALREADY; goto done; @@ -1130,7 +1147,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * then there has been already an ioctl issued against - * an unbound socket and with that triggerd an open + * an unbound socket and with that triggered an open * notification. Send a close notification first to * allow the state transition to bounded. */ @@ -1326,9 +1343,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been - * assigned, this socket will transtion from + * assigned, this socket will transition from * a raw socket into a control socket. To - * allow for a clean transtion, send the + * allow for a clean transition, send the * close notification first. */ skb = create_monitor_ctrl_close(sk); @@ -1379,9 +1396,9 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, lock_sock(sk); - hdev = hci_pi(sk)->hdev; - if (!hdev) { - err = -EBADFD; + hdev = hci_hdev_from_sock(sk); + if (IS_ERR(hdev)) { + err = PTR_ERR(hdev); goto done; } @@ -1743,9 +1760,9 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, goto done; } - hdev = hci_pi(sk)->hdev; - if (!hdev) { - err = -EBADFD; + hdev = hci_hdev_from_sock(sk); + if (IS_ERR(hdev)) { + err = PTR_ERR(hdev); goto done; } diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 9874844a95a9..7827639ecf5c 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -83,7 +83,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn) static void bt_host_release(struct device *dev) { struct hci_dev *hdev = to_hci_dev(dev); - kfree(hdev); + + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) + hci_release_dev(hdev); module_put(THIS_MODULE); } diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 0db48c812662..80848dfc01db 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -508,7 +508,7 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param) { int done_with_skb = 1; - BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param); + BT_DBG("session %p skb %p len %u param 0x%02x", session, skb, skb->len, param); switch (param) { case HIDP_DATA_RTYPE_INPUT: @@ -553,7 +553,7 @@ static void hidp_recv_ctrl_frame(struct hidp_session *session, unsigned char hdr, type, param; int free_skb = 1; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); hdr = skb->data[0]; skb_pull(skb, 1); @@ -589,7 +589,7 @@ static void hidp_recv_intr_frame(struct hidp_session *session, { unsigned char hdr; - BT_DBG("session %p skb %p len %d", session, skb, skb->len); + BT_DBG("session %p skb %p len %u", session, skb, skb->len); hdr = skb->data[0]; skb_pull(skb, 1); @@ -794,7 +794,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->dev.parent = &session->conn->hcon->dev; hid->ll_driver = &hidp_hid_driver; - /* True if device is blacklisted in drivers/hid/hid-quirks.c */ + /* True if device is blocked in drivers/hid/hid-quirks.c */ if (hid_ignore(hid)) { hid_destroy_device(session->hid); session->hid = NULL; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index b6a88b8256c7..77ba68209dbd 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1691,7 +1691,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) if (hcon->out) smp_conn_security(hcon, hcon->pending_sec_level); - /* For LE slave connections, make sure the connection interval + /* For LE peripheral connections, make sure the connection interval * is in the range of the minimum and maximum interval that has * been configured for this connection. If not, then trigger * the connection update procedure. @@ -4237,7 +4237,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, hci_dev_lock(hdev); if (hci_dev_test_flag(hdev, HCI_MGMT) && !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags)) - mgmt_device_connected(hdev, hcon, 0, NULL, 0); + mgmt_device_connected(hdev, hcon, NULL, 0); hci_dev_unlock(hdev); l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP, 0); @@ -6066,7 +6066,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, struct l2cap_ecred_conn_rsp *rsp = (void *) data; struct hci_conn *hcon = conn->hcon; u16 mtu, mps, credits, result; - struct l2cap_chan *chan; + struct l2cap_chan *chan, *tmp; int err = 0, sec_level; int i = 0; @@ -6085,7 +6085,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, cmd_len -= sizeof(*rsp); - list_for_each_entry(chan, &conn->chan_l, list) { + list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { u16 dcid; if (chan->ident != cmd->ident || @@ -6248,7 +6248,7 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) { - struct l2cap_chan *chan; + struct l2cap_chan *chan, *tmp; struct l2cap_ecred_conn_rsp *rsp = (void *) data; u16 result; @@ -6262,7 +6262,7 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, if (!result) return 0; - list_for_each_entry(chan, &conn->chan_l, list) { + list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { if (chan->ident != cmd->ident) continue; @@ -7662,7 +7662,7 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) * at least ensure that we ignore incoming data from them. */ if (hcon->type == LE_LINK && - hci_bdaddr_list_lookup(&hcon->hdev->blacklist, &hcon->dst, + hci_bdaddr_list_lookup(&hcon->hdev->reject_list, &hcon->dst, bdaddr_dst_type(hcon))) { kfree_skb(skb); return; @@ -8119,7 +8119,7 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) dst_type = bdaddr_dst_type(hcon); /* If device is blocked, do not create channels for it */ - if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type)) + if (hci_bdaddr_list_lookup(&hdev->reject_list, &hcon->dst, dst_type)) return; /* Find fixed channels and notify them of the new connection. We diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f9be7f9084d6..cea01e275f1e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -40,7 +40,7 @@ #include "msft.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 20 +#define MGMT_REVISION 21 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -252,12 +252,15 @@ static const u8 mgmt_status_table[] = { MGMT_STATUS_TIMEOUT, /* Instant Passed */ MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */ MGMT_STATUS_FAILED, /* Transaction Collision */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter */ MGMT_STATUS_REJECTED, /* QoS Rejected */ MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */ MGMT_STATUS_REJECTED, /* Insufficient Security */ MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_BUSY, /* Role Switch Pending */ + MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_FAILED, /* Slot Violation */ MGMT_STATUS_FAILED, /* Role Switch Failed */ MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */ @@ -2956,7 +2959,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, /* When pairing a new device, it is expected to remember * this device for future connections. Adding the connection * parameter information ahead of time allows tracking - * of the slave preferred values and will speed up any + * of the peripheral preferred values and will speed up any * further connection establishment. * * If connection parameters already exist, then they @@ -3341,7 +3344,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, } /* The name is stored in the scan response data and so - * no need to udpate the advertising data here. + * no need to update the advertising data here. */ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING)) __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); @@ -4058,8 +4061,10 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + memset(&rp, 0, sizeof(rp)); + if (cp->addr.type == BDADDR_BREDR) { - br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (!br_params) @@ -4127,7 +4132,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (cp->addr.type == BDADDR_BREDR) { - br_params = hci_bdaddr_list_lookup_with_flags(&hdev->whitelist, + br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); @@ -4274,7 +4279,7 @@ int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status) done: hci_dev_unlock(hdev); - bt_dev_dbg(hdev, "add monitor %d complete, status %d", + bt_dev_dbg(hdev, "add monitor %d complete, status %u", rp.monitor_handle, status); return err; @@ -4499,7 +4504,7 @@ int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status) done: hci_dev_unlock(hdev); - bt_dev_dbg(hdev, "remove monitor %d complete, status %d", + bt_dev_dbg(hdev, "remove monitor %d complete, status %u", rp.monitor_handle, status); return err; @@ -4829,7 +4834,7 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5085,7 +5090,7 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5204,7 +5209,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - err = hci_bdaddr_list_add(&hdev->blacklist, &cp->addr.bdaddr, + err = hci_bdaddr_list_add(&hdev->reject_list, &cp->addr.bdaddr, cp->addr.type); if (err < 0) { status = MGMT_STATUS_FAILED; @@ -5240,7 +5245,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - err = hci_bdaddr_list_del(&hdev->blacklist, &cp->addr.bdaddr, + err = hci_bdaddr_list_del(&hdev->reject_list, &cp->addr.bdaddr, cp->addr.type); if (err < 0) { status = MGMT_STATUS_INVALID_PARAMS; @@ -5298,7 +5303,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, static void enable_advertising_instance(struct hci_dev *hdev, u8 status, u16 opcode) { - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); } static void set_advertising_complete(struct hci_dev *hdev, u8 status, @@ -6164,7 +6169,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, static bool ltk_is_valid(struct mgmt_ltk_info *key) { - if (key->master != 0x00 && key->master != 0x01) + if (key->initiator != 0x00 && key->initiator != 0x01) return false; switch (key->addr.type) { @@ -6242,11 +6247,11 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; - type = key->master ? SMP_LTK : SMP_LTK_SLAVE; + type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER; break; case MGMT_LTK_AUTHENTICATED: authenticated = 0x01; - type = key->master ? SMP_LTK : SMP_LTK_SLAVE; + type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER; break; case MGMT_LTK_P256_UNAUTH: authenticated = 0x00; @@ -6342,7 +6347,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, handle = __le16_to_cpu(cp->handle); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) { - bt_dev_err(hdev, "unknown handle (%d) in conn_info response", + bt_dev_err(hdev, "unknown handle (%u) in conn_info response", handle); goto unlock; } @@ -6731,7 +6736,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - err = hci_bdaddr_list_add_with_flags(&hdev->whitelist, + err = hci_bdaddr_list_add_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type, 0); if (err) @@ -6829,7 +6834,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, } if (cp->addr.type == BDADDR_BREDR) { - err = hci_bdaddr_list_del(&hdev->whitelist, + err = hci_bdaddr_list_del(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (err) { @@ -6900,7 +6905,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - list_for_each_entry_safe(b, btmp, &hdev->whitelist, list) { + list_for_each_entry_safe(b, btmp, &hdev->accept_list, list) { device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type); list_del(&b->list); kfree(b); @@ -7199,7 +7204,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, if (!mgmt_rp) goto done; - if (status) + if (eir_len == 0) goto send_rsp; eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV, @@ -7585,6 +7590,9 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; + if (!cur_len) + continue; + if (data[i + 1] == EIR_FLAGS && (!is_adv_data || flags_managed(adv_flags))) return false; @@ -7646,7 +7654,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, struct adv_info *adv_instance, *n; u8 instance; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -7717,7 +7725,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, * advertising. */ if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_NOT_SUPPORTED); if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets) @@ -8176,7 +8184,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; - bt_dev_dbg(hdev, "status %d", status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -8641,7 +8649,7 @@ static u8 mgmt_ltk_type(struct smp_ltk *ltk) { switch (ltk->type) { case SMP_LTK: - case SMP_LTK_SLAVE: + case SMP_LTK_RESPONDER: if (ltk->authenticated) return MGMT_LTK_AUTHENTICATED; return MGMT_LTK_UNAUTHENTICATED; @@ -8687,7 +8695,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) ev.key.rand = key->rand; if (key->type == SMP_LTK) - ev.key.master = 1; + ev.key.initiator = 1; /* Make sure we copy only the significant bytes based on the * encryption key size, and set the rest of the value to zeroes. @@ -8767,15 +8775,19 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, } void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, - u32 flags, u8 *name, u8 name_len) + u8 *name, u8 name_len) { char buf[512]; struct mgmt_ev_device_connected *ev = (void *) buf; u16 eir_len = 0; + u32 flags = 0; bacpy(&ev->addr.bdaddr, &conn->dst); ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type); + if (conn->out) + flags |= MGMT_DEV_FOUND_INITIATED_CONN; + ev->flags = __cpu_to_le32(flags); /* We must ensure that the EIR Data fields are ordered and diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 1deb0ca7a929..6ef701c27da4 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -146,7 +146,7 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, const u16 type = le16_to_cpu(TO_TLV(buffer)->type); if (buffer_left < exp_len) { - bt_dev_warn(hdev, "invalid len left %d, exp >= %d", + bt_dev_warn(hdev, "invalid len left %u, exp >= %u", buffer_left, exp_len); return mgmt_cmd_status(sk, hdev->id, @@ -198,7 +198,7 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, } if (exp_type_len && len != exp_type_len) { - bt_dev_warn(hdev, "invalid length %d, exp %zu for type %d", + bt_dev_warn(hdev, "invalid length %d, exp %zu for type %u", len, exp_type_len, type); return mgmt_cmd_status(sk, hdev->id, diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index e28f15439ce4..b4bfae41e8a5 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -34,12 +34,12 @@ struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; - __u8 pattern[0]; + __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; - __u8 data[0]; + __u8 data[]; }; struct msft_cp_le_monitor_advertisement { @@ -49,7 +49,7 @@ struct msft_cp_le_monitor_advertisement { __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; - __u8 data[0]; + __u8 data[]; } __packed; struct msft_rp_le_monitor_advertisement { @@ -311,7 +311,7 @@ static void msft_le_monitor_advertisement_cb(struct hci_dev *hdev, monitor = idr_find(&hdev->adv_monitors_idr, msft->pending_add_handle); if (!monitor) { - bt_dev_err(hdev, "msft add advmon: monitor %d is not found!", + bt_dev_err(hdev, "msft add advmon: monitor %u is not found!", msft->pending_add_handle); status = HCI_ERROR_UNSPECIFIED; goto unlock; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index ae6f80730561..2c95bb58f901 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -70,7 +70,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) BT_DBG("dlc %p state %ld err %d", d, d->state, err); - spin_lock_bh(&sk->sk_lock.slock); + lock_sock(sk); if (err) sk->sk_err = err; @@ -91,7 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sk->sk_state_change(sk); } - spin_unlock_bh(&sk->sk_lock.slock); + release_sock(sk); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise @@ -974,7 +974,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * if (!parent) return 0; - bh_lock_sock(parent); + lock_sock(parent); /* Check for backlog size */ if (sk_acceptq_is_full(parent)) { @@ -1001,7 +1001,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * result = 1; done: - bh_unlock_sock(parent); + release_sock(parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) parent->sk_state_change(parent); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index a58584949a95..ebd78fdbd6e8 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -198,20 +198,22 @@ static void rfcomm_reparent_device(struct rfcomm_dev *dev) hci_dev_put(hdev); } -static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) +static ssize_t address_show(struct device *tty_dev, + struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%pMR\n", &dev->dst); } -static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf) +static ssize_t channel_show(struct device *tty_dev, + struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%d\n", dev->channel); } -static DEVICE_ATTR(address, 0444, show_address, NULL); -static DEVICE_ATTR(channel, 0444, show_channel, NULL); +static DEVICE_ATTR_RO(address); +static DEVICE_ATTR_RO(channel); static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) @@ -807,7 +809,7 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in return sent; } -static int rfcomm_tty_write_room(struct tty_struct *tty) +static unsigned int rfcomm_tty_write_room(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; int room = 0; @@ -1010,7 +1012,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty) rfcomm_dlc_unthrottle(dev->dlc); } -static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) +static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; @@ -1125,9 +1127,10 @@ int __init rfcomm_init_ttys(void) { int error; - rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS); - if (!rfcomm_tty_driver) - return -ENOMEM; + rfcomm_tty_driver = tty_alloc_driver(RFCOMM_TTY_PORTS, + TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); + if (IS_ERR(rfcomm_tty_driver)) + return PTR_ERR(rfcomm_tty_driver); rfcomm_tty_driver->driver_name = "rfcomm"; rfcomm_tty_driver->name = "rfcomm"; @@ -1135,7 +1138,6 @@ int __init rfcomm_init_ttys(void) rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR; rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL; - rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; rfcomm_tty_driver->init_termios = tty_std_termios; rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL; rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON; @@ -1144,7 +1146,7 @@ int __init rfcomm_init_ttys(void) error = tty_register_driver(rfcomm_tty_driver); if (error) { BT_ERR("Can't register RFCOMM TTY driver"); - put_tty_driver(rfcomm_tty_driver); + tty_driver_kref_put(rfcomm_tty_driver); return error; } @@ -1156,5 +1158,5 @@ int __init rfcomm_init_ttys(void) void rfcomm_cleanup_ttys(void) { tty_unregister_driver(rfcomm_tty_driver); - put_tty_driver(rfcomm_tty_driver); + tty_driver_kref_put(rfcomm_tty_driver); } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 3bd41563f118..98a881586512 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -48,6 +48,8 @@ struct sco_conn { spinlock_t lock; struct sock *sk; + struct delayed_work timeout_work; + unsigned int mtu; }; @@ -74,31 +76,47 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) -static void sco_sock_timeout(struct timer_list *t) +static void sco_sock_timeout(struct work_struct *work) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sco_conn *conn = container_of(work, struct sco_conn, + timeout_work.work); + struct sock *sk; + + sco_conn_lock(conn); + sk = conn->sk; + if (sk) + sock_hold(sk); + sco_conn_unlock(conn); + + if (!sk) + return; BT_DBG("sock %p state %d", sk, sk->sk_state); - bh_lock_sock(sk); + lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); - bh_unlock_sock(sk); - - sco_sock_kill(sk); + release_sock(sk); sock_put(sk); } static void sco_sock_set_timer(struct sock *sk, long timeout) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); - sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); + schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d", sk, sk->sk_state); - sk_stop_timer(sk, &sk->sk_timer); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); } /* ---- SCO connections ---- */ @@ -173,12 +191,14 @@ static void sco_conn_del(struct hci_conn *hcon, int err) if (sk) { sock_hold(sk); - bh_lock_sock(sk); + lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err); - bh_unlock_sock(sk); - sco_sock_kill(sk); + release_sock(sk); sock_put(sk); + + /* Ensure no more work items will run before freeing conn. */ + cancel_delayed_work_sync(&conn->timeout_work); } hcon->sco_data = NULL; @@ -193,6 +213,8 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, sco_pi(sk)->conn = conn; conn->sk = sk; + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); + if (parent) bt_accept_enqueue(parent, sk, true); } @@ -212,44 +234,32 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk, return err; } -static int sco_connect(struct sock *sk) +static int sco_connect(struct hci_dev *hdev, struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; - struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); - hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); - if (!hdev) - return -EHOSTUNREACH; - - hci_dev_lock(hdev); - if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT && - (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) { - err = -EOPNOTSUPP; - goto done; - } + (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) + return -EOPNOTSUPP; hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting); - if (IS_ERR(hcon)) { - err = PTR_ERR(hcon); - goto done; - } + if (IS_ERR(hcon)) + return PTR_ERR(hcon); conn = sco_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); - err = -ENOMEM; - goto done; + return -ENOMEM; } /* Update source addr of the socket */ @@ -257,7 +267,7 @@ static int sco_connect(struct sock *sk) err = sco_chan_add(conn, sk, NULL); if (err) - goto done; + return err; if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); @@ -267,9 +277,6 @@ static int sco_connect(struct sock *sk) sco_sock_set_timer(sk, sk->sk_sndtimeo); } -done: - hci_dev_unlock(hdev); - hci_dev_put(hdev); return err; } @@ -310,7 +317,7 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) if (!sk) goto drop; - BT_DBG("sk %p len %d", sk, skb->len); + BT_DBG("sk %p len %u", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; @@ -394,8 +401,7 @@ static void sco_sock_cleanup_listen(struct sock *parent) */ static void sco_sock_kill(struct sock *sk) { - if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || - sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %d", sk, sk->sk_state); @@ -443,11 +449,10 @@ static void __sco_sock_close(struct sock *sk) /* Must be called on unlocked socket. */ static void sco_sock_close(struct sock *sk) { - sco_sock_clear_timer(sk); lock_sock(sk); + sco_sock_clear_timer(sk); __sco_sock_close(sk); release_sock(sk); - sco_sock_kill(sk); } static void sco_skb_put_cmsg(struct sk_buff *skb, struct msghdr *msg, @@ -500,8 +505,6 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; - timer_setup(&sk->sk_timer, sco_sock_timeout, 0); - bt_sock_link(&sco_sk_list, sk); return sk; } @@ -566,6 +569,7 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; + struct hci_dev *hdev; int err; BT_DBG("sk %p", sk); @@ -580,12 +584,19 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; + hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) + return -EHOSTUNREACH; + hci_dev_lock(hdev); + lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); - err = sco_connect(sk); + err = sco_connect(hdev, sk); + hci_dev_unlock(hdev); + hci_dev_put(hdev); if (err) goto done; @@ -773,6 +784,11 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; + default: + /* use CVSD settings as fallback */ + cp.max_latency = cpu_to_le16(0xffff); + cp.retrans_effort = 0xff; + break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, @@ -905,7 +921,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, opts.mtu = sco_pi(sk)->conn->mtu; - BT_DBG("mtu %d", opts.mtu); + BT_DBG("mtu %u", opts.mtu); len = min_t(unsigned int, len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) @@ -1083,11 +1099,11 @@ static void sco_conn_ready(struct sco_conn *conn) BT_DBG("conn %p", conn); if (sk) { + lock_sock(sk); sco_sock_clear_timer(sk); - bh_lock_sock(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); - bh_unlock_sock(sk); + release_sock(sk); } else { sco_conn_lock(conn); @@ -1102,12 +1118,12 @@ static void sco_conn_ready(struct sco_conn *conn) return; } - bh_lock_sock(parent); + lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { - bh_unlock_sock(parent); + release_sock(parent); sco_conn_unlock(conn); return; } @@ -1128,7 +1144,7 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); - bh_unlock_sock(parent); + release_sock(parent); sco_conn_unlock(conn); } @@ -1167,7 +1183,7 @@ static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) return; - BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); + BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { struct sco_conn *conn; @@ -1196,7 +1212,7 @@ void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) if (!conn) goto drop; - BT_DBG("conn %p len %d", conn, skb->len); + BT_DBG("conn %p len %u", conn, skb->len); if (skb->len) { sco_recv_frame(conn, skb); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 7dd51da73845..11f853d0500f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -40,7 +40,7 @@ ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) /* Low-level debug macros to be used for stuff that we don't want - * accidentially in dmesg, i.e. the values of the various crypto keys + * accidentally in dmesg, i.e. the values of the various crypto keys * and the inputs & outputs of crypto functions. */ #ifdef DEBUG @@ -111,9 +111,9 @@ struct smp_chan { u8 id_addr_type; u8 irk[16]; struct smp_csrk *csrk; - struct smp_csrk *slave_csrk; + struct smp_csrk *responder_csrk; struct smp_ltk *ltk; - struct smp_ltk *slave_ltk; + struct smp_ltk *responder_ltk; struct smp_irk *remote_irk; u8 *link_key; unsigned long flags; @@ -560,7 +560,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) return err; /* This is unlikely, but we need to check that - * we didn't accidentially generate a debug key. + * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; @@ -753,7 +753,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) mgmt_smp_complete(hcon, complete); kfree_sensitive(smp->csrk); - kfree_sensitive(smp->slave_csrk); + kfree_sensitive(smp->responder_csrk); kfree_sensitive(smp->link_key); crypto_free_shash(smp->tfm_cmac); @@ -776,9 +776,9 @@ static void smp_chan_destroy(struct l2cap_conn *conn) kfree_rcu(smp->ltk, rcu); } - if (smp->slave_ltk) { - list_del_rcu(&smp->slave_ltk->list); - kfree_rcu(smp->slave_ltk, rcu); + if (smp->responder_ltk) { + list_del_rcu(&smp->responder_ltk->list); + kfree_rcu(smp->responder_ltk, rcu); } if (smp->remote_irk) { @@ -859,7 +859,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, memset(smp->tk, 0, sizeof(smp->tk)); clear_bit(SMP_FLAG_TK_VALID, &smp->flags); - bt_dev_dbg(hcon->hdev, "auth:%d lcl:%d rem:%d", auth, local_io, + bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io, remote_io); /* If neither side wants MITM, either "just" confirm an incoming @@ -909,8 +909,8 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, hcon->pending_sec_level = BT_SECURITY_HIGH; } - /* If both devices have Keyoard-Display I/O, the master - * Confirms and the slave Enters the passkey. + /* If both devices have Keyboard-Display I/O, the initiator + * Confirms and the responder Enters the passkey. */ if (smp->method == OVERLAP) { if (hcon->role == HCI_ROLE_MASTER) @@ -925,7 +925,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; put_unaligned_le32(passkey, smp->tk); - bt_dev_dbg(hcon->hdev, "PassKey: %d", passkey); + bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey); set_bit(SMP_FLAG_TK_VALID, &smp->flags); } @@ -979,7 +979,7 @@ static u8 smp_random(struct smp_chan *smp) int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, - conn->hcon->out ? "master" : "slave"); + conn->hcon->out ? "initiator" : "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, @@ -1021,8 +1021,8 @@ static u8 smp_random(struct smp_chan *smp) else auth = 0; - /* Even though there's no _SLAVE suffix this is the - * slave STK we're adding for later lookup (the master + /* Even though there's no _RESPONDER suffix this is the + * responder STK we're adding for later lookup (the initiator * STK never needs to be stored). */ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, @@ -1077,10 +1077,10 @@ static void smp_notify_keys(struct l2cap_conn *conn) mgmt_new_csrk(hdev, smp->csrk, persistent); } - if (smp->slave_csrk) { - smp->slave_csrk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); - mgmt_new_csrk(hdev, smp->slave_csrk, persistent); + if (smp->responder_csrk) { + smp->responder_csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->responder_csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->responder_csrk, persistent); } if (smp->ltk) { @@ -1089,10 +1089,10 @@ static void smp_notify_keys(struct l2cap_conn *conn) mgmt_new_ltk(hdev, smp->ltk, persistent); } - if (smp->slave_ltk) { - smp->slave_ltk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); - mgmt_new_ltk(hdev, smp->slave_ltk, persistent); + if (smp->responder_ltk) { + smp->responder_ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->responder_ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->responder_ltk, persistent); } if (smp->link_key) { @@ -1272,7 +1272,7 @@ static void smp_distribute_keys(struct smp_chan *smp) if (*keydist & SMP_DIST_ENC_KEY) { struct smp_cmd_encrypt_info enc; - struct smp_cmd_master_ident ident; + struct smp_cmd_initiator_ident ident; struct smp_ltk *ltk; u8 authenticated; __le16 ediv; @@ -1293,14 +1293,15 @@ static void smp_distribute_keys(struct smp_chan *smp) authenticated = hcon->sec_level == BT_SECURITY_HIGH; ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, - SMP_LTK_SLAVE, authenticated, enc.ltk, + SMP_LTK_RESPONDER, authenticated, enc.ltk, smp->enc_key_size, ediv, rand); - smp->slave_ltk = ltk; + smp->responder_ltk = ltk; ident.ediv = ediv; ident.rand = rand; - smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); + smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident), + &ident); *keydist &= ~SMP_DIST_ENC_KEY; } @@ -1343,7 +1344,7 @@ static void smp_distribute_keys(struct smp_chan *smp) csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED; memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); } - smp->slave_csrk = csrk; + smp->responder_csrk = csrk; smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); @@ -1654,7 +1655,7 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); memset(smp->tk, 0, sizeof(smp->tk)); - bt_dev_dbg(conn->hcon->hdev, "PassKey: %d", value); + bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value); put_unaligned_le32(value, smp->tk); fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: @@ -1902,7 +1903,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that - * we didn't accidentially generate a debug key. + * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; @@ -2048,7 +2049,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp) struct smp_cmd_pairing *req, *rsp; u8 auth; - /* The issue is only observed when we're in slave role */ + /* The issue is only observed when we're in responder role */ if (hcon->out) return SMP_UNSPECIFIED; @@ -2084,7 +2085,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; - bt_dev_dbg(hdev, "conn %p %s", conn, hcon->out ? "master" : "slave"); + bt_dev_dbg(hdev, "conn %p %s", conn, + hcon->out ? "initiator" : "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; @@ -2251,7 +2253,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; - /* We never store STKs for master role, so clear this flag */ + /* We never store STKs for initiator role, so clear this flag */ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); return true; @@ -2467,7 +2469,7 @@ int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, /* Set keys to NULL to make sure smp_failure() does not try to * remove and free already invalidated rcu list entries. */ smp->ltk = NULL; - smp->slave_ltk = NULL; + smp->responder_ltk = NULL; smp->remote_irk = NULL; if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) @@ -2503,7 +2505,7 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) return SMP_INVALID_PARAMS; } - SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); + SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT); skb_pull(skb, sizeof(*rp)); @@ -2512,9 +2514,9 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) +static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_cmd_master_ident *rp = (void *) skb->data; + struct smp_cmd_initiator_ident *rp = (void *)skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; @@ -2913,7 +2915,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } - /* Slave sends DHKey check as response to master */ + /* Responder sends DHKey check as response to initiator */ sc_dhkey_check(smp); } @@ -3000,8 +3002,8 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) reason = smp_cmd_encrypt_info(conn, skb); break; - case SMP_CMD_MASTER_IDENT: - reason = smp_cmd_master_ident(conn, skb); + case SMP_CMD_INITIATOR_IDENT: + reason = smp_cmd_initiator_ident(conn, skb); break; case SMP_CMD_IDENT_INFO: @@ -3081,7 +3083,7 @@ static void bredr_pairing(struct l2cap_chan *chan) if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; - /* Only master may initiate SMP over BR/EDR */ + /* Only initiator may initiate SMP over BR/EDR */ if (hcon->role != HCI_ROLE_MASTER) return; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index fc35a8bf358e..87a59ec2c9f0 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -79,8 +79,8 @@ struct smp_cmd_encrypt_info { __u8 ltk[16]; } __packed; -#define SMP_CMD_MASTER_IDENT 0x07 -struct smp_cmd_master_ident { +#define SMP_CMD_INITIATOR_IDENT 0x07 +struct smp_cmd_initiator_ident { __le16 ediv; __le64 rand; } __packed; @@ -146,7 +146,7 @@ struct smp_cmd_keypress_notify { enum { SMP_STK, SMP_LTK, - SMP_LTK_SLAVE, + SMP_LTK_RESPONDER, SMP_LTK_P256, SMP_LTK_P256_DEBUG, }; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a5d72c48fb66..2eb0e55ef54d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -7,6 +7,7 @@ #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> +#include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> @@ -15,6 +16,7 @@ #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> +#include <net/xdp.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> @@ -87,17 +89,19 @@ reset: static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -106,22 +110,19 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, repeat = 1; bpf_test_timer_enter(&t); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; - + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else - *retval = BPF_PROG_RUN(prog, ctx); - - bpf_cgroup_storage_unset(); + *retval = bpf_prog_run(prog, ctx); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } @@ -326,7 +327,7 @@ __bpf_prog_test_run_raw_tp(void *data) struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); - info->retval = BPF_PROG_RUN(info->prog, info->ctx); + info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } @@ -409,7 +410,7 @@ static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) return ERR_PTR(-ENOMEM); if (data_in) { - err = bpf_check_uarg_tail_zero(data_in, max_size, size); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); @@ -687,6 +688,64 @@ out: return ret; } +static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) +{ + unsigned int ingress_ifindex, rx_queue_index; + struct netdev_rx_queue *rxqueue; + struct net_device *device; + + if (!xdp_md) + return 0; + + if (xdp_md->egress_ifindex != 0) + return -EINVAL; + + ingress_ifindex = xdp_md->ingress_ifindex; + rx_queue_index = xdp_md->rx_queue_index; + + if (!ingress_ifindex && rx_queue_index) + return -EINVAL; + + if (ingress_ifindex) { + device = dev_get_by_index(current->nsproxy->net_ns, + ingress_ifindex); + if (!device) + return -ENODEV; + + if (rx_queue_index >= device->real_num_rx_queues) + goto free_dev; + + rxqueue = __netif_get_rx_queue(device, rx_queue_index); + + if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) + goto free_dev; + + xdp->rxq = &rxqueue->xdp_rxq; + /* The device is now tracked in the xdp->rxq for later + * dev_put() + */ + } + + xdp->data = xdp->data_meta + xdp_md->data; + return 0; + +free_dev: + dev_put(device); + return -EINVAL; +} + +static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) +{ + if (!xdp_md) + return; + + xdp_md->data = xdp->data - xdp->data_meta; + xdp_md->data_end = xdp->data_end - xdp->data_meta; + + if (xdp_md->ingress_ifindex) + dev_put(xdp->rxq->dev); +} + int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -697,35 +756,73 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + struct xdp_md *ctx; u32 max_data_sz; void *data; - int ret; + int ret = -EINVAL; - if (kattr->test.ctx_in || kattr->test.ctx_out) + if (prog->expected_attach_type == BPF_XDP_DEVMAP || + prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; + ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + /* There can't be user provided data before the meta data */ + if (ctx->data_meta || ctx->data_end != size || + ctx->data > ctx->data_end || + unlikely(xdp_metalen_invalid(ctx->data))) + goto free_ctx; + /* Meta data is allocated from the headroom */ + headroom -= ctx->data; + } + /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto free_ctx; + } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); + ret = xdp_convert_md_to_buff(ctx, &xdp); + if (ret) + goto free_data; + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); + /* We convert the xdp_buff back to an xdp_md before checking the return + * code so the reference count of any held netdevice will be decremented + * even if the test run failed. + */ + xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; - if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) - size = xdp.data_end - xdp.data; - ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); + + if (xdp.data_meta != data + headroom || + xdp.data_end != xdp.data_meta + size) + size = xdp.data_end - xdp.data_meta; + + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval, + duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct xdp_md)); + out: bpf_prog_change_xdp(prog, NULL); +free_data: kfree(data); +free_ctx: + kfree(ctx); return ret; } @@ -892,7 +989,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; - retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); bpf_test_timer_leave(&t); @@ -918,3 +1015,49 @@ out: kfree(user_ctx); return ret; } + +int bpf_prog_test_run_syscall(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + __u32 ctx_size_in = kattr->test.ctx_size_in; + void *ctx = NULL; + u32 retval; + int err = 0; + + /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ + if (kattr->test.data_in || kattr->test.data_out || + kattr->test.ctx_out || kattr->test.duration || + kattr->test.repeat || kattr->test.flags) + return -EINVAL; + + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > U16_MAX) + return -EINVAL; + + if (ctx_size_in) { + ctx = kzalloc(ctx_size_in, GFP_USER); + if (!ctx) + return -ENOMEM; + if (copy_from_user(ctx, ctx_in, ctx_size_in)) { + err = -EFAULT; + goto out; + } + } + + rcu_read_lock_trace(); + retval = bpf_prog_run_pin_on_cpu(prog, ctx); + rcu_read_unlock_trace(); + + if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { + err = -EFAULT; + goto out; + } + if (ctx_size_in) + if (copy_to_user(ctx_in, ctx, ctx_size_in)) + err = -EFAULT; +out: + kfree(ctx); + return err; +} diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 05e1cfc1e5cd..291a92546246 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -57,7 +57,7 @@ int main(void) { debug_f = fopen("/dev/kmsg", "w"); setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "Started bpfilter\n"); + fprintf(debug_f, "<5>Started bpfilter\n"); loop(); fclose(debug_f); return 0; diff --git a/net/bridge/br.c b/net/bridge/br.c index ef743f94254d..d3a32c6813e0 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -201,6 +201,48 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +/* called under rtnl_mutex */ +static int br_switchdev_blocking_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + struct switchdev_notifier_brport_info *brport_info; + const struct switchdev_brport *b; + struct net_bridge_port *p; + int err = NOTIFY_DONE; + + p = br_port_get_rtnl(dev); + if (!p) + goto out; + + switch (event) { + case SWITCHDEV_BRPORT_OFFLOADED: + brport_info = ptr; + b = &brport_info->brport; + + err = br_switchdev_port_offload(p, b->dev, b->ctx, + b->atomic_nb, b->blocking_nb, + b->tx_fwd_offload, extack); + err = notifier_from_errno(err); + break; + case SWITCHDEV_BRPORT_UNOFFLOADED: + brport_info = ptr; + b = &brport_info->brport; + + br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, + b->blocking_nb); + break; + } + +out: + return err; +} + +static struct notifier_block br_switchdev_blocking_notifier = { + .notifier_call = br_switchdev_blocking_event, +}; + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -214,17 +256,22 @@ static struct notifier_block br_switchdev_notifier = { int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { + int err = 0; + switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + err = br_multicast_toggle_vlan_snooping(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } - return 0; + return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) @@ -232,6 +279,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -348,11 +397,15 @@ static int __init br_init(void) if (err) goto err_out4; - err = br_netlink_init(); + err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; - brioctl_set(br_ioctl_deviceless_stub); + err = br_netlink_init(); + if (err) + goto err_out6; + + brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; @@ -366,6 +419,8 @@ static int __init br_init(void) return 0; +err_out6: + unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: @@ -385,6 +440,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); + unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c index 001064f7583d..a3c755d0a09d 100644 --- a/net/bridge/br_cfm.c +++ b/net/bridge/br_cfm.c @@ -142,7 +142,7 @@ static void br_cfm_notify(int event, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_CFM_STATUS; - return br_info_notify(event, port->br, NULL, filter); + br_info_notify(event, port->br, NULL, filter); } static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e8b626cc6bfd..8d6bab244c4a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -27,11 +27,14 @@ EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); + struct net_bridge_mcast *brmctx = &br->multicast_ctx; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; + struct net_bridge_vlan *vlan; const unsigned char *dest; u16 vid = 0; @@ -53,7 +56,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, + &state, &vlan)) goto out; if (IS_ENABLED(CONFIG_INET) && @@ -82,15 +86,15 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_flood(br, skb, BR_PKT_MULTICAST, false, true); goto out; } - if (br_multicast_rcv(br, NULL, skb, vid)) { + if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) { kfree_skb(skb); goto out; } - mdst = br_mdb_get(br, skb, vid); + mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb), mdst)) - br_multicast_flood(mdst, skb, false, true); + br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) + br_multicast_flood(mdst, skb, brmctx, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { @@ -450,7 +454,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_set_rx_mode = br_dev_set_multicast_list, .ndo_change_rx_flags = br_dev_change_rx_flags, .ndo_change_mtu = br_change_mtu, - .ndo_do_ioctl = br_dev_ioctl, + .ndo_siocdevprivate = br_dev_siocdevprivate, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = br_netpoll_setup, .ndo_netpoll_cleanup = br_netpoll_cleanup, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 698b79747d32..46812b659710 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -440,9 +440,14 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) if (!port) ret = 0; else { + const struct net_bridge_port *dst = NULL; + fdb = br_fdb_find_rcu(port->br, addr, 0); - ret = fdb && fdb->dst && fdb->dst->dev != dev && - fdb->dst->state == BR_STATE_FORWARDING; + if (fdb) + dst = READ_ONCE(fdb->dst); + + ret = dst && dst->dev != dev && + dst->state == BR_STATE_FORWARDING; } rcu_read_unlock(); @@ -509,7 +514,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (fdb) { memcpy(fdb->key.addr.addr, addr, ETH_ALEN); - fdb->dst = source; + WRITE_ONCE(fdb->dst, source); fdb->key.vlan_id = vid; fdb->flags = flags; fdb->updated = fdb->used = jiffies; @@ -600,10 +605,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst && + if (unlikely(source != READ_ONCE(fdb->dst) && !test_bit(BR_FDB_STICKY, &fdb->flags))) { - br_switchdev_fdb_notify(fdb, RTM_DELNEIGH); - fdb->dst = source; + br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH); + WRITE_ONCE(fdb->dst, source); fdb_modified = true; /* Take over HW learned entry */ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, @@ -650,6 +655,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, u32 portid, u32 seq, int type, unsigned int flags) { + const struct net_bridge_port *dst = READ_ONCE(fdb->dst); unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; @@ -665,7 +671,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_pad2 = 0; ndm->ndm_flags = 0; ndm->ndm_type = 0; - ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; + ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) @@ -726,10 +732,11 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } -static int br_fdb_replay_one(struct notifier_block *nb, - struct net_bridge_fdb_entry *fdb, - struct net_device *dev) +static int br_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, + const struct net_bridge_fdb_entry *fdb, + unsigned long action, const void *ctx) { + const struct net_bridge_port *p = READ_ONCE(fdb->dst); struct switchdev_notifier_fdb_info item; int err; @@ -737,35 +744,39 @@ static int br_fdb_replay_one(struct notifier_block *nb, item.vid = fdb->key.vlan_id; item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); - item.info.dev = dev; + item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); + item.info.dev = (!p || item.is_local) ? br->dev : p->dev; + item.info.ctx = ctx; - err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item); + err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } -int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, +int br_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; + unsigned long action; int err = 0; - if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + if (!nb) + return 0; + + if (!netif_is_bridge_master(br_dev)) return -EINVAL; br = netdev_priv(br_dev); + if (adding) + action = SWITCHDEV_FDB_ADD_TO_DEVICE; + else + action = SWITCHDEV_FDB_DEL_TO_DEVICE; + rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { - struct net_bridge_port *dst = READ_ONCE(fdb->dst); - struct net_device *dst_dev; - - dst_dev = dst ? dst->dev : br->dev; - if (dst_dev != br_dev && dst_dev != dev) - continue; - - err = br_fdb_replay_one(nb, fdb, dst_dev); + err = br_fdb_replay_one(br, nb, fdb, action, ctx); if (err) break; } @@ -774,7 +785,6 @@ int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, return err; } -EXPORT_SYMBOL_GPL(br_fdb_replay); static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, @@ -785,7 +795,7 @@ static void fdb_notify(struct net_bridge *br, int err = -ENOBUFS; if (swdev_notify) - br_switchdev_fdb_notify(fdb, type); + br_switchdev_fdb_notify(br, fdb, type); skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) @@ -955,8 +965,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (flags & NLM_F_EXCL) return -EEXIST; - if (fdb->dst != source) { - fdb->dst = source; + if (READ_ONCE(fdb->dst) != source) { + WRITE_ONCE(fdb->dst, source); modified = true; } } @@ -1001,7 +1011,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, - u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[]) + u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[], + struct netlink_ext_ack *extack) { int err = 0; @@ -1020,6 +1031,11 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { + if (!p && !(ndm->ndm_state & NUD_PERMANENT)) { + NL_SET_ERR_MSG_MOD(extack, + "FDB entry towards bridge must be permanent"); + return -EINVAL; + } err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); @@ -1092,9 +1108,11 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. */ - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb, + extack); } else { - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb, + extack); if (err || !vg || !vg->num_vlans) goto out; @@ -1106,7 +1124,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (!br_vlan_should_use(v)) continue; err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, - nfea_tb); + nfea_tb, extack); if (err) goto out; } @@ -1123,7 +1141,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br, struct net_bridge_fdb_entry *fdb; fdb = br_fdb_find(br, addr, vlan); - if (!fdb || fdb->dst != p) + if (!fdb || READ_ONCE(fdb->dst) != p) return -ENOENT; fdb_delete(br, fdb, true); @@ -1263,6 +1281,10 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (swdev_notify) flags |= BIT(BR_FDB_ADDED_BY_USER); + + if (!p) + flags |= BIT(BR_FDB_LOCAL); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; @@ -1272,8 +1294,8 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, } else { fdb->updated = jiffies; - if (fdb->dst != p) { - fdb->dst = p; + if (READ_ONCE(fdb->dst) != p) { + WRITE_ONCE(fdb->dst, p); modified = true; } @@ -1289,6 +1311,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); + if (!p) + set_bit(BR_FDB_LOCAL, &fdb->flags); + if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6e9b049ae521..ec646656dbf1 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -48,6 +48,8 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb_set_network_header(skb, depth); } + br_switchdev_frame_set_offload_fwd_mark(skb); + dev_queue_xmit(skb); return 0; @@ -76,6 +78,11 @@ static void __br_forward(const struct net_bridge_port *to, struct net *net; int br_hook; + /* Mark the skb for forwarding offload early so that br_handle_vlan() + * can know whether to pop the VLAN header on egress or keep it. + */ + nbp_switchdev_frame_mark_tx_fwd_offload(to, skb); + vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) @@ -174,6 +181,8 @@ static struct net_bridge_port *maybe_deliver( if (!should_deliver(p, skb)) return prev; + nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb); + if (!prev) goto out; @@ -267,19 +276,19 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, /* called with rcu_read_lock */ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { - struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; struct hlist_node *rp; - rp = rcu_dereference(hlist_first_rcu(&br->router_list)); + rp = br_multicast_get_first_rport_node(brmctx, skb); + if (mdst) { p = rcu_dereference(mdst->ports); - if (br_multicast_should_handle_mode(br, mdst->addr.proto) && + if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) && br_multicast_is_star_g(&mdst->addr)) allow_mode_include = false; } else { @@ -290,7 +299,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_port *port, *lport, *rport; lport = p ? p->key.port : NULL; - rport = hlist_entry_safe(rp, struct net_bridge_port, rlist); + rport = br_multicast_rport_from_node_skb(rp, skb); if ((unsigned long)lport > (unsigned long)rport) { port = lport; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f7d2f472ae24..4a02f8bb278a 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -456,7 +456,7 @@ int br_add_bridge(struct net *net, const char *name) dev_net_set(dev, net); dev->rtnl_link_ops = &br_link_ops; - res = register_netdev(dev); + res = register_netdevice(dev); if (res) free_netdev(dev); return res; @@ -467,7 +467,6 @@ int br_del_bridge(struct net *net, const char *name) struct net_device *dev; int ret = 0; - rtnl_lock(); dev = __dev_get_by_name(net, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ @@ -485,7 +484,6 @@ int br_del_bridge(struct net *net, const char *name) else br_dev_delete(dev, NULL); - rtnl_unlock(); return ret; } @@ -562,7 +560,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, struct net_bridge_port *p; int err = 0; unsigned br_hr, dev_hr; - bool changed_addr; + bool changed_addr, fdb_synced = false; /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || @@ -616,6 +614,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, err = dev_set_allmulti(dev, 1); if (err) { + br_multicast_del_port(p); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } @@ -643,15 +642,24 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (err) goto err5; - err = nbp_switchdev_mark_set(p); - if (err) - goto err6; - dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); nbp_update_port_count(br); + if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) { + /* When updating the port count we also update all ports' + * promiscuous mode. + * A port leaving promiscuous mode normally gets the bridge's + * fdb synced to the unicast filter (if supported), however, + * `br_port_clear_promisc` does not distinguish between + * non-promiscuous ports and *new* ports, so we need to + * sync explicitly here. + */ + fdb_synced = br_fdb_sync_static(br, p) == 0; + if (!fdb_synced) + netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); + } netdev_update_features(br->dev); @@ -671,13 +679,13 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, */ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) - goto err7; + goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err7; + goto err6; } spin_lock_bh(&br->lock); @@ -700,11 +708,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, return 0; -err7: +err6: + if (fdb_synced) + br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); -err6: netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; @@ -714,6 +723,7 @@ err4: err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: + br_multicast_del_port(p); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 8875e953ac53..b50382f957c1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -69,8 +69,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; + struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; + struct net_bridge_mcast *brmctx; + struct net_bridge_vlan *vlan; struct net_bridge *br; u16 vid = 0; u8 state; @@ -78,9 +81,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; + brmctx = &p->br->multicast_ctx; + pmctx = &p->multicast_ctx; state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, - &state)) + &state, &vlan)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -98,7 +103,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; - if (br_multicast_rcv(br, p, skb, vid)) + if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } @@ -128,11 +133,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb switch (pkt_type) { case BR_PKT_MULTICAST: - mdst = br_mdb_get(br, skb, vid); + mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { + br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(br)) { + br_multicast_is_router(brmctx, skb)) { local_rcv = true; br->dev->stats.multicast++; } @@ -162,7 +167,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false); else - br_multicast_flood(mdst, skb, local_rcv, false); + br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) @@ -289,11 +294,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); p = br_port_get_rcu(skb->dev); - if (p->flags & BR_VLAN_TUNNEL) { - if (br_handle_ingress_vlan_tunnel(skb, p, - nbp_vlan_group_rcu(p))) - goto drop; - } + if (p->flags & BR_VLAN_TUNNEL) + br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 2db800fc27ca..793b0db9d9a3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -106,15 +106,32 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ -static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p = NULL; unsigned long args[4]; + void __user *argp; int ret = -EOPNOTSUPP; - if (copy_from_user(args, rq->ifr_data, sizeof(args))) - return -EFAULT; + if (in_compat_syscall()) { + unsigned int cargs[4]; + + if (copy_from_user(cargs, data, sizeof(cargs))) + return -EFAULT; + + args[0] = cargs[0]; + args[1] = cargs[1]; + args[2] = cargs[2]; + args[3] = cargs[3]; + + argp = compat_ptr(args[1]); + } else { + if (copy_from_user(args, data, sizeof(args))) + return -EFAULT; + + argp = (void __user *)args[1]; + } switch (args[0]) { case BRCTL_ADD_IF: @@ -171,7 +188,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return -ENOMEM; get_port_ifindices(br, indices, num); - if (copy_to_user((void __user *)args[1], indices, num*sizeof(int))) + if (copy_to_user(argp, indices, num * sizeof(int))) num = -EFAULT; kfree(indices); return num; @@ -232,7 +249,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) rcu_read_unlock(); - if (copy_to_user((void __user *)args[1], &p, sizeof(p))) + if (copy_to_user(argp, &p, sizeof(p))) return -EFAULT; return 0; @@ -282,8 +299,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) } case BRCTL_GET_FDB_ENTRIES: - return get_fdb_entries(br, (void __user *)args[1], - args[2], args[3]); + return get_fdb_entries(br, argp, args[2], args[3]); } if (!ret) { @@ -320,7 +336,7 @@ static int old_deviceless(struct net *net, void __user *uarg) args[2] = get_bridge_ifindices(net, indices, args[2]); - ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int)) + ret = copy_to_user(uarg, indices, args[2]*sizeof(int)) ? -EFAULT : args[2]; kfree(indices); @@ -350,48 +366,47 @@ static int old_deviceless(struct net *net, void __user *uarg) return -EOPNOTSUPP; } -int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg) +int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg) { + int ret = -EOPNOTSUPP; + + rtnl_lock(); + switch (cmd) { case SIOCGIFBR: case SIOCSIFBR: - return old_deviceless(net, uarg); - + ret = old_deviceless(net, uarg); + break; case SIOCBRADDBR: case SIOCBRDELBR: { char buf[IFNAMSIZ]; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } - if (copy_from_user(buf, uarg, IFNAMSIZ)) - return -EFAULT; + if (copy_from_user(buf, uarg, IFNAMSIZ)) { + ret = -EFAULT; + break; + } buf[IFNAMSIZ-1] = 0; if (cmd == SIOCBRADDBR) - return br_add_bridge(net, buf); - - return br_del_bridge(net, buf); - } + ret = br_add_bridge(net, buf); + else + ret = br_del_bridge(net, buf); } - return -EOPNOTSUPP; -} - -int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - struct net_bridge *br = netdev_priv(dev); - - switch (cmd) { - case SIOCDEVPRIVATE: - return old_dev_ioctl(dev, rq, cmd); - + break; case SIOCBRADDIF: case SIOCBRDELIF: - return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF); - + ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); + break; } - br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd); - return -EOPNOTSUPP; + rtnl_unlock(); + + return ret; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 95fa4af0e8dd..0281453f7766 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -16,31 +16,109 @@ #include "br_private.h" -static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev) +static bool +br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, + unsigned long *timer) { - struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p; + *timer = br_timer_value(&pmctx->ip4_mc_router_timer); + return !hlist_unhashed(&pmctx->ip4_rlist); +} + +static bool +br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, + unsigned long *timer) +{ +#if IS_ENABLED(CONFIG_IPV6) + *timer = br_timer_value(&pmctx->ip6_mc_router_timer); + return !hlist_unhashed(&pmctx->ip6_rlist); +#else + *timer = 0; + return false; +#endif +} + +static size_t __br_rports_one_size(void) +{ + return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ + nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ + nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ + nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ + nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ + nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ +} + +size_t br_rports_size(const struct net_bridge_mcast *brmctx) +{ + struct net_bridge_mcast_port *pmctx; + size_t size = nla_total_size(0); /* MDBA_ROUTER */ + + rcu_read_lock(); + hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, + ip4_rlist) + size += __br_rports_one_size(); + +#if IS_ENABLED(CONFIG_IPV6) + hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, + ip6_rlist) + size += __br_rports_one_size(); +#endif + rcu_read_unlock(); + + return size; +} + +int br_rports_fill_info(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx) +{ + u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0; + bool have_ip4_mc_rtr, have_ip6_mc_rtr; + unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; + struct net_bridge_port *p; - if (!br->multicast_router || hlist_empty(&br->router_list)) + if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; - hlist_for_each_entry_rcu(p, &br->router_list, rlist) { - if (!p) + list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { + struct net_bridge_mcast_port *pmctx; + + if (vid) { + struct net_bridge_vlan *v; + + v = br_vlan_find(nbp_vlan_group(p), vid); + if (!v) + continue; + pmctx = &v->port_mcast_ctx; + } else { + pmctx = &p->multicast_ctx; + } + + have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); + have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); + + if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; + if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, - br_timer_value(&p->multicast_router_timer)) || + max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, - p->multicast_router)) { + p->multicast_ctx.multicast_router) || + (have_ip4_mc_rtr && + nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, + ip4_timer)) || + (have_ip6_mc_rtr && + nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, + ip6_timer)) || + (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } @@ -195,7 +273,7 @@ static int __mdb_fill_info(struct sk_buff *skb, switch (mp->addr.proto) { case htons(ETH_P_IP): - dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); + dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) @@ -205,7 +283,7 @@ static int __mdb_fill_info(struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - dump_srcs_mode = !!(mp->br->multicast_mld_version == 2); + dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) @@ -345,6 +423,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_EBRIDGE) { + struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; if (idx < s_idx) @@ -361,7 +440,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) bpm->ifindex = dev->ifindex; if (br_mdb_fill_info(skb, cb, dev) < 0) goto out; - if (br_rports_fill_info(skb, cb, dev) < 0) + if (br_rports_fill_info(skb, &br->multicast_ctx) < 0) goto out; cb->args[1] = 0; @@ -438,7 +517,7 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); - if (pg->key.port->br->multicast_igmp_version == 2) + if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; @@ -447,7 +526,7 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); - if (pg->key.port->br->multicast_mld_version == 1) + if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; @@ -522,19 +601,21 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, } static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, - struct switchdev_obj_port_mdb *mdb, + const struct switchdev_obj_port_mdb *mdb, + unsigned long action, const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, + .ctx = ctx, }, .obj = &mdb->obj, }; int err; - err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } @@ -558,16 +639,21 @@ static int br_mdb_queue_one(struct list_head *mdb_list, } int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb, struct netlink_ext_ack *extack) + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp; + const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; + unsigned long action; LIST_HEAD(mdb_list); int err = 0; ASSERT_RTNL(); + if (!nb) + return 0; + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) return -EINVAL; @@ -587,8 +673,8 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, rcu_read_lock(); hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { - struct net_bridge_port_group __rcu **pp; - struct net_bridge_port_group *p; + struct net_bridge_port_group __rcu * const *pp; + const struct net_bridge_port_group *p; if (mp->host_joined) { err = br_mdb_queue_one(&mdb_list, @@ -617,9 +703,14 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, rcu_read_unlock(); + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + list_for_each_entry(obj, &mdb_list, list) { err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), - extack); + action, ctx, extack); if (err) goto out_free_mdb; } @@ -632,7 +723,6 @@ out_free_mdb: return err; } -EXPORT_SYMBOL_GPL(br_mdb_replay); static void br_mdb_switchdev_host_port(struct net_device *dev, struct net_device *lower_dev, @@ -727,12 +817,12 @@ errout: static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, - int ifindex, u32 pid, + int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { + struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; - struct nlattr *nest; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) @@ -746,8 +836,18 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb, if (!nest) goto cancel; - if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex)) + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto end; + if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { + nla_nest_cancel(skb, port_nest); goto end; + } + if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { + nla_nest_cancel(skb, port_nest); + goto end; + } + nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -763,23 +863,28 @@ cancel: static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) - + nla_total_size(sizeof(__u32)); + + nla_total_size(sizeof(__u32)) + + nla_total_size(sizeof(u16)); } -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; + u16 vid; - ifindex = port ? port->dev->ifindex : 0; + ifindex = pmctx ? pmctx->port->dev->ifindex : 0; + vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : + 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; - err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF); + err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, + NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; @@ -950,14 +1055,47 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static struct net_bridge_mcast * +__br_mdb_choose_context(struct net_bridge *br, + const struct br_mdb_entry *entry, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mcast *brmctx = NULL; + struct net_bridge_vlan *v; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + brmctx = &br->multicast_ctx; + goto out; + } + + if (!entry->vid) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); + goto out; + } + + v = br_vlan_find(br_vlan_group(br), entry->vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); + goto out; + } + if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); + goto out; + } + brmctx = &v->br_mcast_ctx; +out: + return brmctx; +} + static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_mdb_entry *entry, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp, *star_mp; - struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mcast *brmctx; struct br_ip group, star_group; unsigned long now = jiffies; unsigned char flags = 0; @@ -966,6 +1104,10 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, __mdb_entry_to_br_ip(entry, &group, mdb_attrs); + brmctx = __br_mdb_choose_context(br, entry, extack); + if (!brmctx) + return -EINVAL; + /* host join errors which can happen before creating the group */ if (!port) { /* don't allow any flags for host-joined groups */ @@ -999,7 +1141,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -EEXIST; } - br_multicast_host_join(mp, false); + br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; @@ -1030,14 +1172,15 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } rcu_assign_pointer(*pp, p); if (entry->state == MDB_TEMPORARY) - mod_timer(&p->timer, now + br->multicast_membership_interval); + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); /* if we are adding a new EXCLUDE port group (*,G) it needs to be also * added to all S,G entries for proper replication, if we are adding * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be * added to it for proper replication */ - if (br_multicast_should_handle_mode(br, group.proto)) { + if (br_multicast_should_handle_mode(brmctx, group.proto)) { switch (filter_mode) { case MCAST_EXCLUDE: br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index cd2b1e424e54..fd2de35ffb3c 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -204,6 +204,33 @@ static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); br_mrp_skb_common(skb, mrp); + + /* In case the node behaves as MRA then the Test frame needs to have + * an Option TLV which includes eventually a sub-option TLV that has + * the type AUTO_MGR + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + struct br_mrp_sub_option1_hdr *sub_opt = NULL; + struct br_mrp_tlv_hdr *sub_tlv = NULL; + struct br_mrp_oui_hdr *oui = NULL; + u8 length; + + length = sizeof(*sub_opt) + sizeof(*sub_tlv) + sizeof(oui) + + MRP_OPT_PADDING; + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_OPTION, length); + + oui = skb_put(skb, sizeof(*oui)); + memset(oui, 0x0, sizeof(*oui)); + sub_opt = skb_put(skb, sizeof(*sub_opt)); + memset(sub_opt, 0x0, sizeof(*sub_opt)); + + sub_tlv = skb_put(skb, sizeof(*sub_tlv)); + sub_tlv->type = BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR; + + /* 32 bit alligment shall be ensured therefore add 2 bytes */ + skb_put(skb, MRP_OPT_PADDING); + } + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); return skb; @@ -627,8 +654,7 @@ int br_mrp_set_ring_state(struct net_bridge *br, if (!mrp) return -EINVAL; - if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && - state->ring_state != BR_MRP_RING_STATE_CLOSED) + if (mrp->ring_state != state->ring_state) mrp->ring_transitions++; mrp->ring_state = state->ring_state; @@ -715,8 +741,7 @@ int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state) if (!mrp) return -EINVAL; - if (mrp->in_state == BR_MRP_IN_STATE_CLOSED && - state->in_state != BR_MRP_IN_STATE_CLOSED) + if (mrp->in_state != state->in_state) mrp->in_transitions++; mrp->in_state = state->in_state; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 226bb05c3b42..3523c8c7068f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -49,27 +49,30 @@ static const struct rhashtable_params br_sg_port_rht_params = { .automatic_shrinking = true, }; -static void br_multicast_start_querier(struct net_bridge *br, +static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query); -static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); -static void br_ip4_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx); +static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); -static void __del_port_router(struct net_bridge_port *p); +static void +br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted); +static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx); #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif static struct net_bridge_port_group * -__br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +__br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -77,6 +80,7 @@ __br_multicast_add_group(struct net_bridge *br, bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); +static void __br_multicast_stop(struct net_bridge_mcast *brmctx); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, @@ -137,12 +141,14 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, } #endif -struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge *br = brmctx->br; struct br_ip ip; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || + br_multicast_ctx_vlan_global_disabled(brmctx)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) @@ -155,7 +161,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, switch (skb->protocol) { case htons(ETH_P_IP): ip.dst.ip4 = ip_hdr(skb)->daddr; - if (br->multicast_igmp_version == 3) { + if (brmctx->multicast_igmp_version == 3) { struct net_bridge_mdb_entry *mdb; ip.src.ip4 = ip_hdr(skb)->saddr; @@ -168,7 +174,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip.dst.ip6 = ipv6_hdr(skb)->daddr; - if (br->multicast_mld_version == 2) { + if (brmctx->multicast_mld_version == 2) { struct net_bridge_mdb_entry *mdb; ip.src.ip6 = ipv6_hdr(skb)->saddr; @@ -187,6 +193,62 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return br_mdb_ip_get_rcu(br, &ip); } +/* IMPORTANT: this function must be used only when the contexts cannot be + * passed down (e.g. timer) and must be used for read-only purposes because + * the vlan snooping option can change, so it can return any context + * (non-vlan or vlan). Its initial intended purpose is to read timer values + * from the *current* context based on the option. At worst that could lead + * to inconsistent timers when the contexts are changed, i.e. src timer + * which needs to re-arm with a specific delay taken from the old context + */ +static struct net_bridge_mcast_port * +br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg) +{ + struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; + struct net_bridge_vlan *vlan; + + lockdep_assert_held_once(&pg->key.port->br->multicast_lock); + + /* if vlan snooping is disabled use the port's multicast context */ + if (!pg->key.addr.vid || + !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + goto out; + + /* locking is tricky here, due to different rules for multicast and + * vlans we need to take rcu to find the vlan and make sure it has + * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under + * multicast_lock which must be already held here, so the vlan's pmctx + * can safely be used on return + */ + rcu_read_lock(); + vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid); + if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + pmctx = &vlan->port_mcast_ctx; + else + pmctx = NULL; + rcu_read_unlock(); +out: + return pmctx; +} + +/* when snooping we need to check if the contexts should be used + * in the following order: + * - if pmctx is non-NULL (port), check if it should be used + * - if pmctx is NULL (bridge), check if brmctx should be used + */ +static bool +br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx, + const struct net_bridge_mcast_port *pmctx) +{ + if (!netif_running(brmctx->br->dev)) + return false; + + if (pmctx) + return !br_multicast_port_ctx_state_disabled(pmctx); + else + return !br_multicast_ctx_vlan_disabled(brmctx); +} + static bool br_port_group_equal(struct net_bridge_port_group *p, struct net_bridge_port *port, const unsigned char *src) @@ -200,20 +262,23 @@ static bool br_port_group_equal(struct net_bridge_port_group *p, return ether_addr_equal(src, p->eth_addr); } -static void __fwd_add_star_excl(struct net_bridge_port_group *pg, +static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; - struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *src_pg; + struct net_bridge_mcast *brmctx; memset(&sg_key, 0, sizeof(sg_key)); + brmctx = br_multicast_port_ctx_get_global(pmctx); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; - if (br_sg_port_find(br, &sg_key)) + if (br_sg_port_find(brmctx->br, &sg_key)) return; - src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, + src_pg = __br_multicast_add_group(brmctx, pmctx, + sg_ip, pg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) @@ -253,6 +318,7 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, { struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *pg_lst; + struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; @@ -262,9 +328,13 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, mp = br_mdb_ip_get(br, &pg->key.addr); if (!mp) return; + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; + for (pg_lst = mlock_dereference(mp->ports, br); pg_lst; pg_lst = mlock_dereference(pg_lst->next, br)) { @@ -281,7 +351,7 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, __fwd_del_star_excl(pg, &sg_ip); break; case MCAST_EXCLUDE: - __fwd_add_star_excl(pg, &sg_ip); + __fwd_add_star_excl(pmctx, pg, &sg_ip); break; } } @@ -374,7 +444,9 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = star_mp->br; + struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *pg; + struct net_bridge_mcast *brmctx; if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) return; @@ -397,7 +469,12 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, if (br_sg_port_find(br, &sg_key)) continue; - src_pg = __br_multicast_add_group(br, pg->key.port, + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + continue; + brmctx = br_multicast_port_ctx_get_global(pmctx); + + src_pg = __br_multicast_add_group(brmctx, pmctx, &sg->key.addr, sg->eth_addr, MCAST_INCLUDE, false, false); @@ -411,16 +488,23 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { struct net_bridge_mdb_entry *star_mp; + struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *sg; + struct net_bridge_mcast *brmctx; struct br_ip sg_ip; if (src->flags & BR_SGRP_F_INSTALLED) return; memset(&sg_ip, 0, sizeof(sg_ip)); + pmctx = br_multicast_pg_to_port_ctx(src->pg); + if (!pmctx) + return; + brmctx = br_multicast_port_ctx_get_global(pmctx); sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; - sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip, + + sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip, src->pg->eth_addr, MCAST_INCLUDE, false, !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) @@ -689,7 +773,28 @@ static void br_multicast_gc(struct hlist_head *head) } } -static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, +static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct sk_buff *skb) +{ + struct net_bridge_vlan *vlan = NULL; + + if (pmctx && br_multicast_port_ctx_is_vlan(pmctx)) + vlan = pmctx->vlan; + else if (br_multicast_ctx_is_vlan(brmctx)) + vlan = brmctx->vlan; + + if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) { + u16 vlan_proto; + + if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0) + return; + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid); + } +} + +static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, __be32 ip_dst, __be32 group, bool with_srcs, bool over_lmqt, @@ -711,11 +816,11 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); - if (br->multicast_igmp_version == 3) { + if (brmctx->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); if (pg && with_srcs) { - lmqt = now + (br->multicast_last_member_interval * - br->multicast_last_member_count); + lmqt = now + (brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && @@ -731,19 +836,20 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; if ((p && pkt_size > p->dev->mtu) || - pkt_size > br->dev->mtu) + pkt_size > brmctx->br->dev->mtu) return NULL; - skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); + skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; + __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IP); skb_reset_mac_header(skb); eth = eth_hdr(skb); - ether_addr_copy(eth->h_source, br->dev->dev_addr); + ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); @@ -759,8 +865,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; - iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? - inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; + iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? + inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; @@ -772,12 +878,12 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; - switch (br->multicast_igmp_version) { + switch (brmctx->multicast_igmp_version) { case 2: ih = igmp_hdr(skb); ih->type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / + ih->code = (group ? brmctx->multicast_last_member_interval : + brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; @@ -787,11 +893,11 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, case 3: ihv3 = igmpv3_query_hdr(skb); ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; - ihv3->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / + ihv3->code = (group ? brmctx->multicast_last_member_interval : + brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ihv3->group = group; - ihv3->qqic = br->multicast_query_interval / HZ; + ihv3->qqic = brmctx->multicast_query_interval / HZ; ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; ihv3->suppress = sflag; @@ -834,7 +940,8 @@ out: } #if IS_ENABLED(CONFIG_IPV6) -static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, +static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, const struct in6_addr *ip6_dst, const struct in6_addr *group, @@ -859,11 +966,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, u8 *hopopt; mld_hdr_size = sizeof(*mldq); - if (br->multicast_mld_version == 2) { + if (brmctx->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); if (pg && with_srcs) { - llqt = now + (br->multicast_last_member_interval * - br->multicast_last_member_count); + llqt = now + (brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && @@ -879,20 +986,21 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; if ((p && pkt_size > p->dev->mtu) || - pkt_size > br->dev->mtu) + pkt_size > brmctx->br->dev->mtu) return NULL; - skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); + skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; + __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IPV6); /* Ethernet header */ skb_reset_mac_header(skb); eth = eth_hdr(skb); - ether_addr_copy(eth->h_source, br->dev->dev_addr); + ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); @@ -905,14 +1013,14 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ip6h->daddr = *ip6_dst; - if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, - &ip6h->saddr)) { + if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev, + &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false); + br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false); return NULL; } - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); + br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); @@ -930,10 +1038,10 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, /* ICMPv6 */ skb_set_transport_header(skb, skb->len); interval = ipv6_addr_any(group) ? - br->multicast_query_response_interval : - br->multicast_last_member_interval; + brmctx->multicast_query_response_interval : + brmctx->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; - switch (br->multicast_mld_version) { + switch (brmctx->multicast_mld_version) { case 1: mldq = (struct mld_msg *)icmp6_hdr(skb); mldq->mld_type = ICMPV6_MGM_QUERY; @@ -956,7 +1064,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; mld2q->mld2q_nsrcs = htons(llqt_srcs); - mld2q->mld2q_qqic = br->multicast_query_interval / HZ; + mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ; mld2q->mld2q_mca = *group; csum = &mld2q->mld2q_cksum; csum_start = (void *)mld2q; @@ -997,7 +1105,8 @@ out: } #endif -static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, +static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, @@ -1010,7 +1119,7 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, switch (group->proto) { case htons(ETH_P_IP): ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); - return br_ip4_multicast_alloc_query(br, pg, + return br_ip4_multicast_alloc_query(brmctx, pmctx, pg, ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, @@ -1025,7 +1134,7 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); - return br_ip6_multicast_alloc_query(br, pg, + return br_ip6_multicast_alloc_query(brmctx, pmctx, pg, &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, @@ -1203,7 +1312,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( return p; } -void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) +void br_multicast_host_join(const struct net_bridge_mcast *brmctx, + struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; @@ -1216,7 +1326,7 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (br_group_is_l2(&mp->addr)) return; - mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); + mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval); } void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) @@ -1232,8 +1342,8 @@ void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) } static struct net_bridge_port_group * -__br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +__br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -1245,29 +1355,28 @@ __br_multicast_add_group(struct net_bridge *br, struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; - mp = br_multicast_new_group(br, group); + mp = br_multicast_new_group(brmctx->br, group); if (IS_ERR(mp)) return ERR_CAST(mp); - if (!port) { - br_multicast_host_join(mp, true); + if (!pmctx) { + br_multicast_host_join(brmctx, mp, true); goto out; } for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { - if (br_port_group_equal(p, port, src)) + if (br_port_group_equal(p, pmctx->port, src)) goto found; - if ((unsigned long)p->key.port < (unsigned long)port) + if ((unsigned long)p->key.port < (unsigned long)pmctx->port) break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, + p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, filter_mode, RTPROT_KERNEL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); @@ -1276,18 +1385,19 @@ __br_multicast_add_group(struct net_bridge *br, rcu_assign_pointer(*pp, p); if (blocked) p->flags |= MDB_PG_FLAGS_BLOCKED; - br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB); found: if (igmpv2_mldv1) - mod_timer(&p->timer, now + br->multicast_membership_interval); + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); out: return p; } -static int br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -1296,18 +1406,18 @@ static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port_group *pg; int err; - spin_lock(&br->multicast_lock); - pg = __br_multicast_add_group(br, port, group, src, filter_mode, + spin_lock(&brmctx->br->multicast_lock); + pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = PTR_ERR_OR_ZERO(pg); - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); return err; } -static int br_ip4_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src, @@ -1325,13 +1435,13 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src, filter_mode, - igmpv2); + return br_multicast_add_group(brmctx, pmctx, &br_group, src, + filter_mode, igmpv2); } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src, @@ -1349,28 +1459,71 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.vid = vid; filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src, filter_mode, - mldv1); + return br_multicast_add_group(brmctx, pmctx, &br_group, src, + filter_mode, mldv1); } #endif -static void br_multicast_router_expired(struct timer_list *t) +static bool br_multicast_rport_del(struct hlist_node *rlist) { - struct net_bridge_port *port = - from_timer(port, t, multicast_router_timer); - struct net_bridge *br = port->br; + if (hlist_unhashed(rlist)) + return false; + + hlist_del_init_rcu(rlist); + return true; +} + +static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx) +{ + return br_multicast_rport_del(&pmctx->ip4_rlist); +} + +static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + return br_multicast_rport_del(&pmctx->ip6_rlist); +#else + return false; +#endif +} + +static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx, + struct timer_list *t, + struct hlist_node *rlist) +{ + struct net_bridge *br = pmctx->port->br; + bool del; spin_lock(&br->multicast_lock); - if (port->multicast_router == MDB_RTR_TYPE_DISABLED || - port->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&port->multicast_router_timer)) + if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + pmctx->multicast_router == MDB_RTR_TYPE_PERM || + timer_pending(t)) goto out; - __del_port_router(port); + del = br_multicast_rport_del(rlist); + br_multicast_rport_del_notify(pmctx, del); out: spin_unlock(&br->multicast_lock); } +static void br_ip4_multicast_router_expired(struct timer_list *t) +{ + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip4_mc_router_timer); + + br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_router_expired(struct timer_list *t) +{ + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip6_mc_router_timer); + + br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); +} +#endif + static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { @@ -1384,64 +1537,86 @@ static void br_mc_router_state_change(struct net_bridge *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -static void br_multicast_local_router_expired(struct timer_list *t) +static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx, + struct timer_list *timer) { - struct net_bridge *br = from_timer(br, t, multicast_router_timer); - - spin_lock(&br->multicast_lock); - if (br->multicast_router == MDB_RTR_TYPE_DISABLED || - br->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&br->multicast_router_timer)) + spin_lock(&brmctx->br->multicast_lock); + if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + brmctx->multicast_router == MDB_RTR_TYPE_PERM || + br_ip4_multicast_is_router(brmctx) || + br_ip6_multicast_is_router(brmctx)) goto out; - br_mc_router_state_change(br, false); + br_mc_router_state_change(brmctx->br, false); out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); +} + +static void br_ip4_multicast_local_router_expired(struct timer_list *t) +{ + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_mc_router_timer); + + br_multicast_local_router_expired(brmctx, t); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_local_router_expired(struct timer_list *t) +{ + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_mc_router_timer); + + br_multicast_local_router_expired(brmctx, t); } +#endif -static void br_multicast_querier_expired(struct net_bridge *br, +static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!netif_running(brmctx->br->dev) || + br_multicast_ctx_vlan_global_disabled(brmctx) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) goto out; - br_multicast_start_querier(br, query); + br_multicast_start_querier(brmctx, query); out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_other_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_other_query.timer); - br_multicast_querier_expired(br, &br->ip4_own_query); + br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_other_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_other_query.timer); - br_multicast_querier_expired(br, &br->ip6_own_query); + br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } #endif -static void br_multicast_select_own_querier(struct net_bridge *br, +static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx, struct br_ip *ip, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) - br->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; + brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else - br->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; + brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } -static void __br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, +static void __br_multicast_send_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, @@ -1453,19 +1628,23 @@ static void __br_multicast_send_query(struct net_bridge *br, struct sk_buff *skb; u8 igmp_type; + if (!br_multicast_ctx_should_use(brmctx, pmctx) || + !br_multicast_ctx_matches_vlan_snooping(brmctx)) + return; + again_under_lmqt: - skb = br_multicast_alloc_query(br, pg, ip_dst, group, with_srcs, - over_lmqt, sflag, &igmp_type, + skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group, + with_srcs, over_lmqt, sflag, &igmp_type, need_rexmit); if (!skb) return; - if (port) { - skb->dev = port->dev; - br_multicast_count(br, port, skb, igmp_type, + if (pmctx) { + skb->dev = pmctx->port->dev; + br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, - dev_net(port->dev), NULL, skb, NULL, skb->dev, + dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); if (over_lmqt && with_srcs && sflag) { @@ -1473,35 +1652,64 @@ again_under_lmqt: goto again_under_lmqt; } } else { - br_multicast_select_own_querier(br, group, skb); - br_multicast_count(br, port, skb, igmp_type, + br_multicast_select_own_querier(brmctx, group, skb); + br_multicast_count(brmctx->br, NULL, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } } -static void br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_read_querier(const struct bridge_mcast_querier *querier, + struct bridge_mcast_querier *dest) +{ + unsigned int seq; + + memset(dest, 0, sizeof(*dest)); + do { + seq = read_seqcount_begin(&querier->seq); + dest->port_ifidx = querier->port_ifidx; + memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip)); + } while (read_seqcount_retry(&querier->seq, seq)); +} + +static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, + struct bridge_mcast_querier *querier, + int ifindex, + struct br_ip *saddr) +{ + lockdep_assert_held_once(&brmctx->br->multicast_lock); + + write_seqcount_begin(&querier->seq); + querier->port_ifidx = ifindex; + memcpy(&querier->addr, saddr, sizeof(*saddr)); + write_seqcount_end(&querier->seq); +} + +static void br_multicast_send_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *own_query) { struct bridge_mcast_other_query *other_query = NULL; + struct bridge_mcast_querier *querier; struct br_ip br_group; unsigned long time; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED) || - !br_opt_get(br, BROPT_MULTICAST_QUERIER)) + if (!br_multicast_ctx_should_use(brmctx, pmctx) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || + !brmctx->multicast_querier) return; memset(&br_group.dst, 0, sizeof(br_group.dst)); - if (port ? (own_query == &port->ip4_own_query) : - (own_query == &br->ip4_own_query)) { - other_query = &br->ip4_other_query; + if (pmctx ? (own_query == &pmctx->ip4_own_query) : + (own_query == &brmctx->ip4_own_query)) { + querier = &brmctx->ip4_querier; + other_query = &brmctx->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { - other_query = &br->ip6_other_query; + querier = &brmctx->ip6_querier; + other_query = &brmctx->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } @@ -1509,31 +1717,39 @@ static void br_multicast_send_query(struct net_bridge *br, if (!other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, port, NULL, NULL, &br_group, false, 0, - NULL); + /* we're about to select ourselves as querier */ + if (!pmctx && querier->port_ifidx) { + struct br_ip zeroip = {}; + + br_multicast_update_querier(brmctx, querier, 0, &zeroip); + } + + __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false, + 0, NULL); time = jiffies; - time += own_query->startup_sent < br->multicast_startup_query_count ? - br->multicast_startup_query_interval : - br->multicast_query_interval; + time += own_query->startup_sent < brmctx->multicast_startup_query_count ? + brmctx->multicast_startup_query_interval : + brmctx->multicast_query_interval; mod_timer(&own_query->timer, time); } static void -br_multicast_port_query_expired(struct net_bridge_port *port, +br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *query) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; + struct net_bridge_mcast *brmctx; spin_lock(&br->multicast_lock); - if (port->state == BR_STATE_DISABLED || - port->state == BR_STATE_BLOCKING) + if (br_multicast_port_ctx_state_stopped(pmctx)) goto out; - if (query->startup_sent < br->multicast_startup_query_count) + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; - br_multicast_send_query(port->br, port, query); + br_multicast_send_query(brmctx, pmctx, query); out: spin_unlock(&br->multicast_lock); @@ -1541,17 +1757,19 @@ out: static void br_ip4_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip4_own_query.timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip4_own_query.timer); - br_multicast_port_query_expired(port, &port->ip4_own_query); + br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip6_own_query.timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip6_own_query.timer); - br_multicast_port_query_expired(port, &port->ip6_own_query); + br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } #endif @@ -1560,19 +1778,27 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; + struct net_bridge_mcast_port *pmctx; + struct net_bridge_mcast *brmctx; bool need_rexmit = false; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED) || - !br_opt_get(br, BROPT_MULTICAST_QUERIER)) + !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + goto out; + + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + goto out; + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (!brmctx->multicast_querier) goto out; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif if (!other_query || timer_pending(&other_query->timer)) @@ -1580,15 +1806,15 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 1, NULL); } - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + - br->multicast_last_member_interval); + brmctx->multicast_last_member_interval); out: spin_unlock(&br->multicast_lock); } @@ -1606,21 +1832,40 @@ static int br_mc_disabled_update(struct net_device *dev, bool value, return switchdev_port_attr_set(dev, &attr, extack); } +void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx) +{ + pmctx->port = port; + pmctx->vlan = vlan; + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + timer_setup(&pmctx->ip4_mc_router_timer, + br_ip4_multicast_router_expired, 0); + timer_setup(&pmctx->ip4_own_query.timer, + br_ip4_multicast_port_query_expired, 0); +#if IS_ENABLED(CONFIG_IPV6) + timer_setup(&pmctx->ip6_mc_router_timer, + br_ip6_multicast_router_expired, 0); + timer_setup(&pmctx->ip6_own_query.timer, + br_ip6_multicast_port_query_expired, 0); +#endif +} + +void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&pmctx->ip6_mc_router_timer); +#endif + del_timer_sync(&pmctx->ip4_mc_router_timer); +} + int br_multicast_add_port(struct net_bridge_port *port) { int err; - port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; + br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx); - timer_setup(&port->multicast_router_timer, - br_multicast_router_expired, 0); - timer_setup(&port->ip4_own_query.timer, - br_ip4_multicast_port_query_expired, 0); -#if IS_ENABLED(CONFIG_IPV6) - timer_setup(&port->ip6_own_query.timer, - br_ip6_multicast_port_query_expired, 0); -#endif err = br_mc_disabled_update(port->dev, br_opt_get(port->br, BROPT_MULTICAST_ENABLED), @@ -1649,7 +1894,7 @@ void br_multicast_del_port(struct net_bridge_port *port) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); - del_timer_sync(&port->multicast_router_timer); + br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } @@ -1662,50 +1907,63 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query) mod_timer(&query->timer, jiffies); } -static void __br_multicast_enable_port(struct net_bridge_port *port) +static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; + struct net_bridge_mcast *brmctx; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || + !netif_running(br->dev)) return; - br_multicast_enable(&port->ip4_own_query); + br_multicast_enable(&pmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - br_multicast_enable(&port->ip6_own_query); + br_multicast_enable(&pmctx->ip6_own_query); #endif - if (port->multicast_router == MDB_RTR_TYPE_PERM && - hlist_unhashed(&port->rlist)) - br_multicast_add_router(br, port); + if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) { + br_ip4_multicast_add_router(brmctx, pmctx); + br_ip6_multicast_add_router(brmctx, pmctx); + } } void br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; - spin_lock(&br->multicast_lock); - __br_multicast_enable_port(port); - spin_unlock(&br->multicast_lock); + spin_lock_bh(&br->multicast_lock); + __br_multicast_enable_port_ctx(&port->multicast_ctx); + spin_unlock_bh(&br->multicast_lock); } -void br_multicast_disable_port(struct net_bridge_port *port) +static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; - - spin_lock(&br->multicast_lock); - hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) - br_multicast_find_del_pg(br, pg); - - __del_port_router(port); - - del_timer(&port->multicast_router_timer); - del_timer(&port->ip4_own_query.timer); + bool del = false; + + hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist) + if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) && + (!br_multicast_port_ctx_is_vlan(pmctx) || + pg->key.addr.vid == pmctx->vlan->vid)) + br_multicast_find_del_pg(pmctx->port->br, pg); + + del |= br_ip4_multicast_rport_del(pmctx); + del_timer(&pmctx->ip4_mc_router_timer); + del_timer(&pmctx->ip4_own_query.timer); + del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&port->ip6_own_query.timer); + del_timer(&pmctx->ip6_mc_router_timer); + del_timer(&pmctx->ip6_own_query.timer); #endif - spin_unlock(&br->multicast_lock); + br_multicast_rport_del_notify(pmctx, del); +} + +void br_multicast_disable_port(struct net_bridge_port *port) +{ + spin_lock_bh(&port->br->multicast_lock); + __br_multicast_disable_port_ctx(&port->multicast_ctx); + spin_unlock_bh(&port->br->multicast_lock); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -1730,31 +1988,32 @@ static void __grp_src_mod_timer(struct net_bridge_group_src *src, br_multicast_fwd_src_handle(src); } -static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) +static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->key.port->br; - u32 lmqc = br->multicast_last_member_count; + u32 lmqc = brmctx->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!netif_running(brmctx->br->dev) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif - lmqt = now + br_multicast_lmqt(br); + lmqt = now + br_multicast_lmqt(brmctx); hlist_for_each_entry(ent, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_SEND) { ent->flags &= ~BR_SGRP_F_SEND; if (ent->timer.expires > lmqt) { - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; @@ -1763,41 +2022,42 @@ static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) } } - if (!br_opt_get(br, BROPT_MULTICAST_QUERIER) || + if (!brmctx->multicast_querier || !other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 1, NULL); - lmi = now + br->multicast_last_member_interval; + lmi = now + brmctx->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } -static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) +static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->key.port->br; unsigned long now = jiffies, lmi; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!netif_running(brmctx->br->dev) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) { - lmi = now + br->multicast_last_member_interval; - pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + lmi = now + brmctx->multicast_last_member_interval; + pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1; + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 0, NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) @@ -1806,8 +2066,8 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) if (pg->filter_mode == MCAST_EXCLUDE && (!timer_pending(&pg->timer) || - time_after(pg->timer.expires, now + br_multicast_lmqt(br)))) - mod_timer(&pg->timer, now + br_multicast_lmqt(br)); + time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx)))) + mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx)); } /* State Msg type New state Actions @@ -1815,11 +2075,11 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ -static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1838,10 +2098,11 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_a } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; return changed; @@ -1852,7 +2113,8 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_a * Delete (A-B) * Group Timer=GMI */ -static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, +static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -1876,7 +2138,8 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, br_multicast_fwd_src_handle(ent); } - br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type); __grp_src_delete_marked(pg); } @@ -1887,11 +2150,11 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, * Delete (Y-A) * Group Timer=GMI */ -static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1912,13 +2175,14 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, - now + br_multicast_gmi(br)); + now + br_multicast_gmi(brmctx)); changed = true; } } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (__grp_src_delete_marked(pg)) @@ -1927,28 +2191,28 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, return changed; } -static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_isexc_incl(pg, h_addr, srcs, nsrcs, addr_size, + __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_isexc_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs, + addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; - mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } @@ -1957,11 +2221,12 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ -static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -1985,14 +2250,15 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, changed = true; } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } @@ -2002,11 +2268,12 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,X-A) * Send Q(G) */ -static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -2033,21 +2300,24 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, changed = true; } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); - __grp_send_query_and_rexmit(pg); + __grp_send_query_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_toin(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2055,12 +2325,12 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_toin_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_toin_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } @@ -2082,7 +2352,9 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,A*B) * Group Timer=GMI */ -static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, +static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2109,11 +2381,12 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, br_multicast_fwd_src_handle(ent); } - br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type); __grp_src_delete_marked(pg); if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); } /* State Msg type New state Actions @@ -2123,7 +2396,9 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,A-Y) * Group Timer=GMI */ -static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2155,39 +2430,41 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_toex(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_toex_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, + addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_toex_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; - mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } @@ -2195,7 +2472,9 @@ static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) */ -static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; @@ -2217,11 +2496,12 @@ static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } @@ -2230,7 +2510,9 @@ static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ -static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; @@ -2259,28 +2541,31 @@ static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_block(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_block(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_block_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_block_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } @@ -2315,12 +2600,12 @@ br_multicast_find_port(struct net_bridge_mdb_entry *mp, return NULL; } -static int br_ip4_multicast_igmp3_report(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { - bool igmpv2 = br->multicast_igmp_version == 2; + bool igmpv2 = brmctx->multicast_igmp_version == 2; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; const unsigned char *src; @@ -2367,25 +2652,29 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, if (nsrcs == 0 && (type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE)) { - if (!port || igmpv2) { - br_ip4_multicast_leave_group(br, port, group, vid, src); + if (!pmctx || igmpv2) { + br_ip4_multicast_leave_group(brmctx, pmctx, + group, vid, src); continue; } } else { - err = br_ip4_multicast_add_group(br, port, group, vid, - src, igmpv2); + err = br_ip4_multicast_add_group(brmctx, pmctx, group, + vid, src, igmpv2); if (err) break; } - if (!port || igmpv2) + if (!pmctx || igmpv2) continue; - spin_lock_bh(&br->multicast_lock); - mdst = br_mdb_ip4_get(br, group, vid); + spin_lock_bh(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + goto unlock_continue; + + mdst = br_mdb_ip4_get(brmctx->br, group, vid); if (!mdst) goto unlock_continue; - pg = br_multicast_find_port(mdst, port, src); + pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; /* reload grec and host addr */ @@ -2393,51 +2682,57 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, h_addr, grec->grec_src, + changed = br_multicast_isexc(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, h_addr, grec->grec_src, + changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, h_addr, grec->grec_src, + changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, h_addr, grec->grec_src, + changed = br_multicast_block(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; } if (changed) - br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); } return err; } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_mld2_report(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { - bool mldv1 = br->multicast_mld_version == 1; + bool mldv1 = brmctx->multicast_mld_version == 1; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; unsigned int nsrcs_offset; + struct mld2_report *mld2r; const unsigned char *src; - struct icmp6hdr *icmp6h; struct in6_addr *h_addr; struct mld2_grec *grec; unsigned int grec_len; @@ -2445,12 +2740,12 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, int i, len, num; int err = 0; - if (!ipv6_mc_may_pull(skb, sizeof(*icmp6h))) + if (!ipv6_mc_may_pull(skb, sizeof(*mld2r))) return -EINVAL; - icmp6h = icmp6_hdr(skb); - num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = skb_transport_offset(skb) + sizeof(*icmp6h); + mld2r = (struct mld2_report *)icmp6_hdr(skb); + num = ntohs(mld2r->mld2r_ngrec); + len = skb_transport_offset(skb) + sizeof(*mld2r); for (i = 0; i < num; i++) { __be16 *_nsrcs, __nsrcs; @@ -2493,153 +2788,243 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { - if (!port || mldv1) { - br_ip6_multicast_leave_group(br, port, + if (!pmctx || mldv1) { + br_ip6_multicast_leave_group(brmctx, pmctx, &grec->grec_mca, vid, src); continue; } } else { - err = br_ip6_multicast_add_group(br, port, + err = br_ip6_multicast_add_group(brmctx, pmctx, &grec->grec_mca, vid, src, mldv1); if (err) break; } - if (!port || mldv1) + if (!pmctx || mldv1) continue; - spin_lock_bh(&br->multicast_lock); - mdst = br_mdb_ip6_get(br, &grec->grec_mca, vid); + spin_lock_bh(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + goto unlock_continue; + + mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid); if (!mdst) goto unlock_continue; - pg = br_multicast_find_port(mdst, port, src); + pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; h_addr = &ipv6_hdr(skb)->saddr; switch (grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, h_addr, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, h_addr, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, h_addr, + changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, h_addr, + changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, h_addr, + changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, h_addr, + changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; } if (changed) - br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); } return err; } #endif -static bool br_ip4_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - __be32 saddr) +static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct br_ip *saddr) { - if (!timer_pending(&br->ip4_own_query.timer) && - !timer_pending(&br->ip4_other_query.timer)) - goto update; + int port_ifidx = pmctx ? pmctx->port->dev->ifindex : 0; + struct timer_list *own_timer, *other_timer; + struct bridge_mcast_querier *querier; - if (!br->ip4_querier.addr.src.ip4) - goto update; + switch (saddr->proto) { + case htons(ETH_P_IP): + querier = &brmctx->ip4_querier; + own_timer = &brmctx->ip4_own_query.timer; + other_timer = &brmctx->ip4_other_query.timer; + if (!querier->addr.src.ip4 || + ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4)) + goto update; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + querier = &brmctx->ip6_querier; + own_timer = &brmctx->ip6_own_query.timer; + other_timer = &brmctx->ip6_other_query.timer; + if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0) + goto update; + break; +#endif + default: + return false; + } - if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.src.ip4)) + if (!timer_pending(own_timer) && !timer_pending(other_timer)) goto update; return false; update: - br->ip4_querier.addr.src.ip4 = saddr; - - /* update protected by general multicast_lock by caller */ - rcu_assign_pointer(br->ip4_querier.port, port); + br_multicast_update_querier(brmctx, querier, port_ifidx, saddr); return true; } -#if IS_ENABLED(CONFIG_IPV6) -static bool br_ip6_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - struct in6_addr *saddr) +static struct net_bridge_port * +__br_multicast_get_querier_port(struct net_bridge *br, + const struct bridge_mcast_querier *querier) { - if (!timer_pending(&br->ip6_own_query.timer) && - !timer_pending(&br->ip6_other_query.timer)) - goto update; - - if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.src.ip6) <= 0) - goto update; - - return false; + int port_ifidx = READ_ONCE(querier->port_ifidx); + struct net_bridge_port *p; + struct net_device *dev; -update: - br->ip6_querier.addr.src.ip6 = *saddr; + if (port_ifidx == 0) + return NULL; - /* update protected by general multicast_lock by caller */ - rcu_assign_pointer(br->ip6_querier.port, port); + dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx); + if (!dev) + return NULL; + p = br_port_get_rtnl_rcu(dev); + if (!p || p->br != br) + return NULL; - return true; + return p; } -#endif -static bool br_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - struct br_ip *saddr) +size_t br_multicast_querier_state_size(void) { - switch (saddr->proto) { - case htons(ETH_P_IP): - return br_ip4_multicast_select_querier(br, port, saddr->src.ip4); + return nla_total_size(0) + /* nest attribute */ + nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */ + nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */ + nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */ #if IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return br_ip6_multicast_select_querier(br, port, &saddr->src.ip6); + nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */ + nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */ + nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */ #endif + 0; +} + +/* protected by rtnl or rcu */ +int br_multicast_dump_querier_state(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx, + int nest_attr) +{ + struct bridge_mcast_querier querier = {}; + struct net_bridge_port *p; + struct nlattr *nest; + + if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || + br_multicast_ctx_vlan_global_disabled(brmctx)) + return 0; + + nest = nla_nest_start(skb, nest_attr); + if (!nest) + return -EMSGSIZE; + + rcu_read_lock(); + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip4_other_query.timer)) + goto out_v6; + + br_multicast_read_querier(&brmctx->ip4_querier, &querier); + if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, + querier.addr.src.ip4)) { + rcu_read_unlock(); + goto out_err; } - return false; + p = __br_multicast_get_querier_port(brmctx->br, &querier); + if (timer_pending(&brmctx->ip4_other_query.timer) && + (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER, + br_timer_value(&brmctx->ip4_other_query.timer), + BRIDGE_QUERIER_PAD) || + (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) { + rcu_read_unlock(); + goto out_err; + } + +out_v6: +#if IS_ENABLED(CONFIG_IPV6) + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip6_other_query.timer)) + goto out; + + br_multicast_read_querier(&brmctx->ip6_querier, &querier); + if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS, + &querier.addr.src.ip6)) { + rcu_read_unlock(); + goto out_err; + } + + p = __br_multicast_get_querier_port(brmctx->br, &querier); + if (timer_pending(&brmctx->ip6_other_query.timer) && + (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER, + br_timer_value(&brmctx->ip6_other_query.timer), + BRIDGE_QUERIER_PAD) || + (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT, + p->dev->ifindex)))) { + rcu_read_unlock(); + goto out_err; + } +out: +#endif + rcu_read_unlock(); + nla_nest_end(skb, nest); + if (!nla_len(nest)) + nla_nest_cancel(skb, nest); + + return 0; + +out_err: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; } static void -br_multicast_update_query_timer(struct net_bridge *br, +br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, unsigned long max_delay) { if (!timer_pending(&query->timer)) query->delay_time = jiffies + max_delay; - mod_timer(&query->timer, jiffies + br->multicast_querier_interval); + mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval); } static void br_port_mc_router_state_change(struct net_bridge_port *p, @@ -2655,74 +3040,208 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -/* - * Add port to router_list +static struct net_bridge_port * +br_multicast_rport_from_node(struct net_bridge_mcast *brmctx, + struct hlist_head *mc_router_list, + struct hlist_node *rlist) +{ + struct net_bridge_mcast_port *pmctx; + +#if IS_ENABLED(CONFIG_IPV6) + if (mc_router_list == &brmctx->ip6_mc_router_list) + pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, + ip6_rlist); + else +#endif + pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, + ip4_rlist); + + return pmctx->port; +} + +static struct hlist_node * +br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx, + struct net_bridge_port *port, + struct hlist_head *mc_router_list) + +{ + struct hlist_node *slot = NULL; + struct net_bridge_port *p; + struct hlist_node *rlist; + + hlist_for_each(rlist, mc_router_list) { + p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist); + + if ((unsigned long)port >= (unsigned long)p) + break; + + slot = rlist; + } + + return slot; +} + +static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx, + struct hlist_node *rnode) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (rnode != &pmctx->ip6_rlist) + return hlist_unhashed(&pmctx->ip6_rlist); + else + return hlist_unhashed(&pmctx->ip4_rlist); +#else + return true; +#endif +} + +/* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct hlist_node *rlist, + struct hlist_head *mc_router_list) { - struct net_bridge_port *p; - struct hlist_node *slot = NULL; + struct hlist_node *slot; - if (!hlist_unhashed(&port->rlist)) + if (!hlist_unhashed(rlist)) return; - hlist_for_each_entry(p, &br->router_list, rlist) { - if ((unsigned long) port >= (unsigned long) p) - break; - slot = &p->rlist; - } + slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list); if (slot) - hlist_add_behind_rcu(&port->rlist, slot); + hlist_add_behind_rcu(rlist, slot); else - hlist_add_head_rcu(&port->rlist, &br->router_list); - br_rtr_notify(br->dev, port, RTM_NEWMDB); - br_port_mc_router_state_change(port, true); + hlist_add_head_rcu(rlist, mc_router_list); + + /* For backwards compatibility for now, only notify if we + * switched from no IPv4/IPv6 multicast router to a new + * IPv4 or IPv6 multicast router. + */ + if (br_multicast_no_router_otherpf(pmctx, rlist)) { + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); + br_port_mc_router_state_change(pmctx->port, true); + } +} + +/* Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) +{ + br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist, + &brmctx->ip4_mc_router_list); +} + +/* Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist, + &brmctx->ip6_mc_router_list); +#endif } -static void br_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct timer_list *timer, + struct hlist_node *rlist, + struct hlist_head *mc_router_list) { unsigned long now = jiffies; - if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { - if (!timer_pending(&br->multicast_router_timer)) - br_mc_router_state_change(br, true); - mod_timer(&br->multicast_router_timer, - now + br->multicast_querier_interval); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + return; + + if (!pmctx) { + if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if (!br_ip4_multicast_is_router(brmctx) && + !br_ip6_multicast_is_router(brmctx)) + br_mc_router_state_change(brmctx->br, true); + mod_timer(timer, now + brmctx->multicast_querier_interval); } return; } - if (port->multicast_router == MDB_RTR_TYPE_DISABLED || - port->multicast_router == MDB_RTR_TYPE_PERM) + if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + pmctx->multicast_router == MDB_RTR_TYPE_PERM) return; - br_multicast_add_router(br, port); + br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list); + mod_timer(timer, now + brmctx->multicast_querier_interval); +} + +static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) +{ + struct timer_list *timer = &brmctx->ip4_mc_router_timer; + struct hlist_node *rlist = NULL; + + if (pmctx) { + timer = &pmctx->ip4_mc_router_timer; + rlist = &pmctx->ip4_rlist; + } + + br_multicast_mark_router(brmctx, pmctx, timer, rlist, + &brmctx->ip4_mc_router_list); +} + +static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct timer_list *timer = &brmctx->ip6_mc_router_timer; + struct hlist_node *rlist = NULL; + + if (pmctx) { + timer = &pmctx->ip6_mc_router_timer; + rlist = &pmctx->ip6_rlist; + } + + br_multicast_mark_router(brmctx, pmctx, timer, rlist, + &brmctx->ip6_mc_router_list); +#endif +} + +static void +br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, + unsigned long max_delay) +{ + if (!br_multicast_select_querier(brmctx, pmctx, saddr)) + return; - mod_timer(&port->multicast_router_timer, - now + br->multicast_querier_interval); + br_multicast_update_query_timer(brmctx, query, max_delay); + br_ip4_multicast_mark_router(brmctx, pmctx); } -static void br_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, - struct bridge_mcast_other_query *query, - struct br_ip *saddr, - unsigned long max_delay) +#if IS_ENABLED(CONFIG_IPV6) +static void +br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct bridge_mcast_other_query *query, + struct br_ip *saddr, + unsigned long max_delay) { - if (!br_multicast_select_querier(br, port, saddr)) + if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; - br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + br_multicast_update_query_timer(brmctx, query, max_delay); + br_ip6_multicast_mark_router(brmctx, pmctx); } +#endif -static void br_ip4_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { @@ -2733,14 +3252,13 @@ static void br_ip4_multicast_query(struct net_bridge *br, struct igmpv3_query *ih3; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct br_ip saddr; + struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; __be32 group; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; group = ih->group; @@ -2755,7 +3273,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs || - (br->multicast_igmp_version == 3 && group && ih3->suppress)) + (brmctx->multicast_igmp_version == 3 && group && + ih3->suppress)) goto out; max_delay = ih3->code ? @@ -2768,16 +3287,17 @@ static void br_ip4_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; - br_multicast_query_received(br, port, &br->ip4_other_query, - &saddr, max_delay); + br_ip4_multicast_query_received(brmctx, pmctx, + &brmctx->ip4_other_query, + &saddr, max_delay); goto out; } - mp = br_mdb_ip4_get(br, group, vid); + mp = br_mdb_ip4_get(brmctx->br, group, vid); if (!mp) goto out; - max_delay *= br->multicast_last_member_count; + max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? @@ -2786,23 +3306,23 @@ static void br_ip4_multicast_query(struct net_bridge *br, mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && - (br->multicast_igmp_version == 2 || + (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { @@ -2812,7 +3332,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct mld2_query *mld2q; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct br_ip saddr; + struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; unsigned int offset = skb_transport_offset(skb); @@ -2820,9 +3340,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, bool is_general_query; int err = 0; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; if (transport_len == sizeof(*mld)) { @@ -2842,7 +3361,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - if (br->multicast_mld_version == 2 && + if (brmctx->multicast_mld_version == 2 && !ipv6_addr_any(&mld2q->mld2q_mca) && mld2q->mld2q_suppress) goto out; @@ -2856,18 +3375,19 @@ static int br_ip6_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; - br_multicast_query_received(br, port, &br->ip6_other_query, - &saddr, max_delay); + br_ip6_multicast_query_received(brmctx, pmctx, + &brmctx->ip6_other_query, + &saddr, max_delay); goto out; } else if (!group) { goto out; } - mp = br_mdb_ip6_get(br, group, vid); + mp = br_mdb_ip6_get(brmctx->br, group, vid); if (!mp) goto out; - max_delay *= br->multicast_last_member_count; + max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : @@ -2875,25 +3395,25 @@ static int br_ip6_multicast_query(struct net_bridge *br, mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && - (br->multicast_mld_version == 1 || + (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); return err; } #endif static void -br_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +br_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, struct bridge_mcast_other_query *other_query, struct bridge_mcast_own_query *own_query, @@ -2904,22 +3424,21 @@ br_multicast_leave_group(struct net_bridge *br, unsigned long now; unsigned long time; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; - mp = br_mdb_ip_get(br, group); + mp = br_mdb_ip_get(brmctx->br, group); if (!mp) goto out; - if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { + if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { - if (!br_port_group_equal(p, port, src)) + if (!br_port_group_equal(p, pmctx->port, src)) continue; if (p->flags & MDB_PG_FLAGS_PERMANENT) @@ -2934,19 +3453,19 @@ br_multicast_leave_group(struct net_bridge *br, if (timer_pending(&other_query->timer)) goto out; - if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - __br_multicast_send_query(br, port, NULL, NULL, &mp->addr, + if (brmctx->multicast_querier) { + __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr, false, 0, NULL); - time = jiffies + br->multicast_last_member_count * - br->multicast_last_member_interval; + time = jiffies + brmctx->multicast_last_member_count * + brmctx->multicast_last_member_interval; mod_timer(&own_query->timer, time); - for (p = mlock_dereference(mp->ports, br); - p != NULL; - p = mlock_dereference(p->next, br)) { - if (!br_port_group_equal(p, port, src)) + for (p = mlock_dereference(mp->ports, brmctx->br); + p != NULL && pmctx != NULL; + p = mlock_dereference(p->next, brmctx->br)) { + if (!br_port_group_equal(p, pmctx->port, src)) continue; if (!hlist_unhashed(&p->mglist) && @@ -2961,10 +3480,10 @@ br_multicast_leave_group(struct net_bridge *br, } now = jiffies; - time = now + br->multicast_last_member_count * - br->multicast_last_member_interval; + time = now + brmctx->multicast_last_member_count * + brmctx->multicast_last_member_interval; - if (!port) { + if (!pmctx) { if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : @@ -2975,10 +3494,10 @@ br_multicast_leave_group(struct net_bridge *br, goto out; } - for (p = mlock_dereference(mp->ports, br); + for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL; - p = mlock_dereference(p->next, br)) { - if (p->key.port != port) + p = mlock_dereference(p->next, brmctx->br)) { + if (p->key.port != pmctx->port) continue; if (!hlist_unhashed(&p->mglist) && @@ -2991,11 +3510,11 @@ br_multicast_leave_group(struct net_bridge *br, break; } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } -static void br_ip4_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src) @@ -3006,20 +3525,21 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, if (ipv4_is_local_multicast(group)) return; - own_query = port ? &port->ip4_own_query : &br->ip4_own_query; + own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, + br_multicast_leave_group(brmctx, pmctx, &br_group, + &brmctx->ip4_other_query, own_query, src); } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src) @@ -3030,14 +3550,15 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, if (ipv6_addr_is_ll_all_nodes(group)) return; - own_query = port ? &port->ip6_own_query : &br->ip6_own_query; + own_query = pmctx ? &pmctx->ip6_own_query : &brmctx->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, + br_multicast_leave_group(brmctx, pmctx, &br_group, + &brmctx->ip6_other_query, own_query, src); } #endif @@ -3075,8 +3596,8 @@ static void br_multicast_err_count(const struct net_bridge *br, u64_stats_update_end(&pstats->syncp); } -static void br_multicast_pim(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_pim(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb); @@ -3087,27 +3608,32 @@ static void br_multicast_pim(struct net_bridge *br, pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; - br_multicast_mark_router(br, port); + spin_lock(&brmctx->br->multicast_lock); + br_ip4_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); } -static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (ip_hdr(skb)->protocol != IPPROTO_IGMP || igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; - br_multicast_mark_router(br, port); + spin_lock(&brmctx->br->multicast_lock); + br_ip4_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); return 0; } -static int br_multicast_ipv4_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct igmphdr *ih; int err; @@ -3119,14 +3645,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) - br_multicast_pim(br, port, skb); + br_multicast_pim(brmctx, pmctx, skb); } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { - br_ip4_multicast_mrd_rcv(br, port, skb); + br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb); } return 0; } else if (err < 0) { - br_multicast_err_count(br, port, skb->protocol); + br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } @@ -3138,42 +3664,45 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group, vid, src, - true); + err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid, + src, true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb, vid); + err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - br_ip4_multicast_query(br, port, skb, vid); + br_ip4_multicast_query(brmctx, pmctx, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: - br_ip4_multicast_leave_group(br, port, ih->group, vid, src); + br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src); break; } - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; - br_multicast_mark_router(br, port); + spin_lock(&brmctx->br->multicast_lock); + br_ip6_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); } -static int br_multicast_ipv6_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct mld_msg *mld; int err; @@ -3185,11 +3714,11 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; if (err == -ENODATA && ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) - br_ip6_multicast_mrd_rcv(br, port, skb); + br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb); return 0; } else if (err < 0) { - br_multicast_err_count(br, port, skb->protocol); + br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } @@ -3200,29 +3729,32 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, case ICMPV6_MGM_REPORT: src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, - src, true); + err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca, + vid, src, true); break; case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb, vid); + err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb, vid); + err = br_ip6_multicast_query(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; - br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src); + br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid, + src); break; } - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #endif -int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; @@ -3230,16 +3762,36 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; + if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { + const struct net_bridge_vlan *masterv; + + /* the vlan has the master flag set only when transmitting + * through the bridge device + */ + if (br_vlan_is_master(vlan)) { + masterv = vlan; + *brmctx = &vlan->br_mcast_ctx; + *pmctx = NULL; + } else { + masterv = vlan->brvlan; + *brmctx = &vlan->brvlan->br_mcast_ctx; + *pmctx = &vlan->port_mcast_ctx; + } + + if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return 0; + } + switch (skb->protocol) { case htons(ETH_P_IP): - ret = br_multicast_ipv4_rcv(br, port, skb, vid); + ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - ret = br_multicast_ipv6_rcv(br, port, skb, vid); + ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } @@ -3247,32 +3799,39 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, return ret; } -static void br_multicast_query_expired(struct net_bridge *br, +static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query, struct bridge_mcast_querier *querier) { - spin_lock(&br->multicast_lock); - if (query->startup_sent < br->multicast_startup_query_count) + spin_lock(&brmctx->br->multicast_lock); + if (br_multicast_ctx_vlan_disabled(brmctx)) + goto out; + + if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; - RCU_INIT_POINTER(querier->port, NULL); - br_multicast_send_query(br, NULL, query); - spin_unlock(&br->multicast_lock); + br_multicast_send_query(brmctx, NULL, query); +out: + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_own_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_own_query.timer); - br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier); + br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, + &brmctx->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_own_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_own_query.timer); - br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier); + br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, + &brmctx->ip6_querier); } #endif @@ -3289,45 +3848,65 @@ static void br_multicast_gc_work(struct work_struct *work) br_multicast_gc(&deleted_head); } -void br_multicast_init(struct net_bridge *br) +void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx) { - br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; - - br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - br->multicast_last_member_count = 2; - br->multicast_startup_query_count = 2; + brmctx->br = br; + brmctx->vlan = vlan; + brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + brmctx->multicast_last_member_count = 2; + brmctx->multicast_startup_query_count = 2; - br->multicast_last_member_interval = HZ; - br->multicast_query_response_interval = 10 * HZ; - br->multicast_startup_query_interval = 125 * HZ / 4; - br->multicast_query_interval = 125 * HZ; - br->multicast_querier_interval = 255 * HZ; - br->multicast_membership_interval = 260 * HZ; + brmctx->multicast_last_member_interval = HZ; + brmctx->multicast_query_response_interval = 10 * HZ; + brmctx->multicast_startup_query_interval = 125 * HZ / 4; + brmctx->multicast_query_interval = 125 * HZ; + brmctx->multicast_querier_interval = 255 * HZ; + brmctx->multicast_membership_interval = 260 * HZ; - br->ip4_other_query.delay_time = 0; - br->ip4_querier.port = NULL; - br->multicast_igmp_version = 2; + brmctx->ip4_other_query.delay_time = 0; + brmctx->ip4_querier.port_ifidx = 0; + seqcount_init(&brmctx->ip4_querier.seq); + brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) - br->multicast_mld_version = 1; - br->ip6_other_query.delay_time = 0; - br->ip6_querier.port = NULL; + brmctx->multicast_mld_version = 1; + brmctx->ip6_other_query.delay_time = 0; + brmctx->ip6_querier.port_ifidx = 0; + seqcount_init(&brmctx->ip6_querier.seq); #endif - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); - spin_lock_init(&br->multicast_lock); - timer_setup(&br->multicast_router_timer, - br_multicast_local_router_expired, 0); - timer_setup(&br->ip4_other_query.timer, + timer_setup(&brmctx->ip4_mc_router_timer, + br_ip4_multicast_local_router_expired, 0); + timer_setup(&brmctx->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); - timer_setup(&br->ip4_own_query.timer, + timer_setup(&brmctx->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - timer_setup(&br->ip6_other_query.timer, + timer_setup(&brmctx->ip6_mc_router_timer, + br_ip6_multicast_local_router_expired, 0); + timer_setup(&brmctx->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); - timer_setup(&br->ip6_own_query.timer, + timer_setup(&brmctx->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif +} + +void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) +{ + __br_multicast_stop(brmctx); +} + +void br_multicast_init(struct net_bridge *br) +{ + br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; + + br_multicast_ctx_init(br, NULL, &br->multicast_ctx); + + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); + br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); + + spin_lock_init(&br->multicast_lock); INIT_HLIST_HEAD(&br->mdb_list); INIT_HLIST_HEAD(&br->mcast_gc_list); INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); @@ -3395,8 +3974,8 @@ void br_multicast_leave_snoopers(struct net_bridge *br) br_ip6_multicast_leave_snoopers(br); } -static void __br_multicast_open(struct net_bridge *br, - struct bridge_mcast_own_query *query) +static void __br_multicast_open_query(struct net_bridge *br, + struct bridge_mcast_own_query *query) { query->startup_sent = 0; @@ -3406,25 +3985,194 @@ static void __br_multicast_open(struct net_bridge *br, mod_timer(&query->timer, jiffies); } -void br_multicast_open(struct net_bridge *br) +static void __br_multicast_open(struct net_bridge_mcast *brmctx) { - __br_multicast_open(br, &br->ip4_own_query); + __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - __br_multicast_open(br, &br->ip6_own_query); + __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query); #endif } -void br_multicast_stop(struct net_bridge *br) +void br_multicast_open(struct net_bridge *br) +{ + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_open(&vlan->br_mcast_ctx); + } + } + } else { + __br_multicast_open(&br->multicast_ctx); + } +} + +static void __br_multicast_stop(struct net_bridge_mcast *brmctx) { - del_timer_sync(&br->multicast_router_timer); - del_timer_sync(&br->ip4_other_query.timer); - del_timer_sync(&br->ip4_own_query.timer); + del_timer_sync(&brmctx->ip4_mc_router_timer); + del_timer_sync(&brmctx->ip4_other_query.timer); + del_timer_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&br->ip6_other_query.timer); - del_timer_sync(&br->ip6_own_query.timer); + del_timer_sync(&brmctx->ip6_mc_router_timer); + del_timer_sync(&brmctx->ip6_other_query.timer); + del_timer_sync(&brmctx->ip6_own_query.timer); #endif } +void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge *br; + + /* it's okay to check for the flag without the multicast lock because it + * can only change under RTNL -> multicast_lock, we need the latter to + * sync with timers and packets + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) + return; + + if (br_vlan_is_master(vlan)) { + br = vlan->br; + + if (!br_vlan_is_brentry(vlan) || + (on && + br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx))) + return; + + spin_lock_bh(&br->multicast_lock); + vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; + spin_unlock_bh(&br->multicast_lock); + + if (on) + __br_multicast_open(&vlan->br_mcast_ctx); + else + __br_multicast_stop(&vlan->br_mcast_ctx); + } else { + struct net_bridge_mcast *brmctx; + + brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx); + if (on && br_multicast_ctx_vlan_global_disabled(brmctx)) + return; + + br = vlan->port->br; + spin_lock_bh(&br->multicast_lock); + vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; + if (on) + __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx); + else + __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx); + spin_unlock_bh(&br->multicast_lock); + } +} + +static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge_port *p; + + if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) + return; + + list_for_each_entry(p, &vlan->br->port_list, list) { + struct net_bridge_vlan *vport; + + vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); + if (!vport) + continue; + br_multicast_toggle_one_vlan(vport, on); + } + + if (br_vlan_is_brentry(vlan)) + br_multicast_toggle_one_vlan(vlan, on); +} + +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + struct net_bridge_port *p; + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) + return 0; + + if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); + return -EINVAL; + } + + vg = br_vlan_group(br); + if (!vg) + return 0; + + br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); + + /* disable/enable non-vlan mcast contexts based on vlan snooping */ + if (on) + __br_multicast_stop(&br->multicast_ctx); + else + __br_multicast_open(&br->multicast_ctx); + list_for_each_entry(p, &br->port_list, list) { + if (on) + br_multicast_disable_port(p); + else + br_multicast_enable_port(p); + } + + list_for_each_entry(vlan, &vg->vlan_list, vlist) + br_multicast_toggle_vlan(vlan, on); + + return 0; +} + +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) +{ + ASSERT_RTNL(); + + /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and + * requires only RTNL to change + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return false; + + vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; + br_multicast_toggle_vlan(vlan, on); + + return true; +} + +void br_multicast_stop(struct net_bridge *br) +{ + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_stop(&vlan->br_mcast_ctx); + } + } + } else { + __br_multicast_stop(&br->multicast_ctx); + } +} + void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; @@ -3437,114 +4185,189 @@ void br_multicast_dev_del(struct net_bridge *br) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); + br_multicast_ctx_deinit(&br->multicast_ctx); br_multicast_gc(&deleted_head); cancel_work_sync(&br->mcast_gc_work); rcu_barrier(); } -int br_multicast_set_router(struct net_bridge *br, unsigned long val) +int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) { int err = -EINVAL; - spin_lock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: - br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); - del_timer(&br->multicast_router_timer); - br->multicast_router = val; + br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); + del_timer(&brmctx->ip4_mc_router_timer); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&brmctx->ip6_mc_router_timer); +#endif + brmctx->multicast_router = val; err = 0; break; case MDB_RTR_TYPE_TEMP_QUERY: - if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) - br_mc_router_state_change(br, false); - br->multicast_router = val; + if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + br_mc_router_state_change(brmctx->br, false); + brmctx->multicast_router = val; err = 0; break; } - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return err; } -static void __del_port_router(struct net_bridge_port *p) +static void +br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) { - if (hlist_unhashed(&p->rlist)) + if (!deleted) + return; + + /* For backwards compatibility for now, only notify if there is + * no multicast router anymore for both IPv4 and IPv6. + */ + if (!hlist_unhashed(&pmctx->ip4_rlist)) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (!hlist_unhashed(&pmctx->ip6_rlist)) return; - hlist_del_init_rcu(&p->rlist); - br_rtr_notify(p->br->dev, p, RTM_DELMDB); - br_port_mc_router_state_change(p, false); +#endif + + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); + br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) - p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } -int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) +int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, + unsigned long val) { - struct net_bridge *br = p->br; + struct net_bridge_mcast *brmctx; unsigned long now = jiffies; int err = -EINVAL; + bool del = false; - spin_lock(&br->multicast_lock); - if (p->multicast_router == val) { + brmctx = br_multicast_port_ctx_get_global(pmctx); + spin_lock_bh(&brmctx->br->multicast_lock); + if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) - mod_timer(&p->multicast_router_timer, - now + br->multicast_querier_interval); + if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { + mod_timer(&pmctx->ip4_mc_router_timer, + now + brmctx->multicast_querier_interval); +#if IS_ENABLED(CONFIG_IPV6) + mod_timer(&pmctx->ip6_mc_router_timer, + now + brmctx->multicast_querier_interval); +#endif + } err = 0; goto unlock; } switch (val) { case MDB_RTR_TYPE_DISABLED: - p->multicast_router = MDB_RTR_TYPE_DISABLED; - __del_port_router(p); - del_timer(&p->multicast_router_timer); + pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; + del |= br_ip4_multicast_rport_del(pmctx); + del_timer(&pmctx->ip4_mc_router_timer); + del |= br_ip6_multicast_rport_del(pmctx); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&pmctx->ip6_mc_router_timer); +#endif + br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_TEMP_QUERY: - p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - __del_port_router(p); + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + del |= br_ip4_multicast_rport_del(pmctx); + del |= br_ip6_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_PERM: - p->multicast_router = MDB_RTR_TYPE_PERM; - del_timer(&p->multicast_router_timer); - br_multicast_add_router(br, p); + pmctx->multicast_router = MDB_RTR_TYPE_PERM; + del_timer(&pmctx->ip4_mc_router_timer); + br_ip4_multicast_add_router(brmctx, pmctx); +#if IS_ENABLED(CONFIG_IPV6) + del_timer(&pmctx->ip6_mc_router_timer); +#endif + br_ip6_multicast_add_router(brmctx, pmctx); break; case MDB_RTR_TYPE_TEMP: - p->multicast_router = MDB_RTR_TYPE_TEMP; - br_multicast_mark_router(br, p); + pmctx->multicast_router = MDB_RTR_TYPE_TEMP; + br_ip4_multicast_mark_router(brmctx, pmctx); + br_ip6_multicast_mark_router(brmctx, pmctx); break; default: goto unlock; } err = 0; unlock: - spin_unlock(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return err; } -static void br_multicast_start_querier(struct net_bridge *br, +int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router) +{ + int err; + + if (br_vlan_is_master(v)) + err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router); + else + err = br_multicast_set_port_router(&v->port_mcast_ctx, + mcast_router); + + return err; +} + +static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { struct net_bridge_port *port; - __br_multicast_open(br, query); + if (!br_multicast_ctx_matches_vlan_snooping(brmctx)) + return; + + __br_multicast_open_query(brmctx->br, query); rcu_read_lock(); - list_for_each_entry_rcu(port, &br->port_list, list) { - if (port->state == BR_STATE_DISABLED || - port->state == BR_STATE_BLOCKING) + list_for_each_entry_rcu(port, &brmctx->br->port_list, list) { + struct bridge_mcast_own_query *ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_own_query *ip6_own_query; +#endif + + if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx)) continue; - if (query == &br->ip4_own_query) - br_multicast_enable(&port->ip4_own_query); + if (br_multicast_ctx_is_vlan(brmctx)) { + struct net_bridge_vlan *vlan; + + vlan = br_vlan_find(nbp_vlan_group_rcu(port), + brmctx->vlan->vid); + if (!vlan || + br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx)) + continue; + + ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query; +#endif + } else { + ip4_own_query = &port->multicast_ctx.ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + ip6_own_query = &port->multicast_ctx.ip6_own_query; +#endif + } + + if (query == &brmctx->ip4_own_query) + br_multicast_enable(ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else - br_multicast_enable(&port->ip6_own_query); + br_multicast_enable(ip6_own_query); #endif } rcu_read_unlock(); @@ -3578,7 +4401,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) - __br_multicast_enable_port(port); + __br_multicast_enable_port_ctx(&port->multicast_ctx); change_snoopers = true; @@ -3621,47 +4444,48 @@ bool br_multicast_router(const struct net_device *dev) bool is_router; spin_lock_bh(&br->multicast_lock); - is_router = br_multicast_is_router(br); + is_router = br_multicast_is_router(&br->multicast_ctx, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } EXPORT_SYMBOL_GPL(br_multicast_router); -int br_multicast_set_querier(struct net_bridge *br, unsigned long val) +int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long max_delay; val = !!val; - spin_lock_bh(&br->multicast_lock); - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) == val) + spin_lock_bh(&brmctx->br->multicast_lock); + if (brmctx->multicast_querier == val) goto unlock; - br_opt_toggle(br, BROPT_MULTICAST_QUERIER, !!val); + WRITE_ONCE(brmctx->multicast_querier, val); if (!val) goto unlock; - max_delay = br->multicast_query_response_interval; + max_delay = brmctx->multicast_query_response_interval; - if (!timer_pending(&br->ip4_other_query.timer)) - br->ip4_other_query.delay_time = jiffies + max_delay; + if (!timer_pending(&brmctx->ip4_other_query.timer)) + brmctx->ip4_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip4_own_query); + br_multicast_start_querier(brmctx, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - if (!timer_pending(&br->ip6_other_query.timer)) - br->ip6_other_query.delay_time = jiffies + max_delay; + if (!timer_pending(&brmctx->ip6_other_query.timer)) + brmctx->ip6_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip6_own_query); + br_multicast_start_querier(brmctx, &brmctx->ip6_own_query); #endif unlock: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } -int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) +int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, + unsigned long val) { /* Currently we support only version 2 and 3 */ switch (val) { @@ -3672,15 +4496,16 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) return -EINVAL; } - spin_lock_bh(&br->multicast_lock); - br->multicast_igmp_version = val; - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); + brmctx->multicast_igmp_version = val; + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #if IS_ENABLED(CONFIG_IPV6) -int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) +int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, + unsigned long val) { /* Currently we support version 1 and 2 */ switch (val) { @@ -3691,9 +4516,9 @@ int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) return -EINVAL; } - spin_lock_bh(&br->multicast_lock); - br->multicast_mld_version = val; - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); + brmctx->multicast_mld_version = val; + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } @@ -3785,7 +4610,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) memset(ð, 0, sizeof(eth)); eth.h_proto = htons(proto); - ret = br_multicast_querier_exists(br, ð, NULL); + ret = br_multicast_querier_exists(&br->multicast_ctx, ð, NULL); unlock: rcu_read_unlock(); @@ -3804,9 +4629,11 @@ EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); */ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) { + struct net_bridge_mcast *brmctx; struct net_bridge *br; struct net_bridge_port *port; bool ret = false; + int port_ifidx; rcu_read_lock(); if (!netif_is_bridge_port(dev)) @@ -3817,17 +4644,20 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) goto unlock; br = port->br; + brmctx = &br->multicast_ctx; switch (proto) { case ETH_P_IP: - if (!timer_pending(&br->ip4_other_query.timer) || - rcu_dereference(br->ip4_querier.port) == port) + port_ifidx = brmctx->ip4_querier.port_ifidx; + if (!timer_pending(&brmctx->ip4_other_query.timer) || + port_ifidx == port->dev->ifindex) goto unlock; break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: - if (!timer_pending(&br->ip6_other_query.timer) || - rcu_dereference(br->ip6_querier.port) == port) + port_ifidx = brmctx->ip6_querier.port_ifidx; + if (!timer_pending(&brmctx->ip6_other_query.timer) || + port_ifidx == port->dev->ifindex) goto unlock; break; #endif @@ -3842,6 +4672,64 @@ unlock: } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); +/** + * br_multicast_has_router_adjacent - Checks for a router behind a bridge port + * @dev: The bridge port adjacent to which to check for a multicast router + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a multicast router is behind one of the other ports of this + * bridge. Otherwise returns false. + */ +bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) +{ + struct net_bridge_mcast_port *pmctx; + struct net_bridge_mcast *brmctx; + struct net_bridge_port *port; + bool ret = false; + + rcu_read_lock(); + port = br_port_get_check_rcu(dev); + if (!port) + goto unlock; + + brmctx = &port->br->multicast_ctx; + switch (proto) { + case ETH_P_IP: + hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, + ip4_rlist) { + if (pmctx->port == port) + continue; + + ret = true; + goto unlock; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case ETH_P_IPV6: + hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, + ip6_rlist) { + if (pmctx->port == port) + continue; + + ret = true; + goto unlock; + } + break; +#endif + default: + /* when compiled without IPv6 support, be conservative and + * always assume presence of an IPv6 multicast router + */ + ret = true; + } + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent); + static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, const struct sk_buff *skb, u8 type, u8 dir) { @@ -3913,7 +4801,8 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, u64_stats_update_end(&pstats->syncp); } -void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, +void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index 13290a749d09..f91c071d1608 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -33,7 +33,8 @@ static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr); -static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, +static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, @@ -388,7 +389,8 @@ static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src, } } -static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, +static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, int filter_mode) { @@ -405,14 +407,15 @@ static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr); break; case MCAST_EXCLUDE: - br_multicast_create_eht_set_entry(pg, &zero_addr, h_addr, - MCAST_EXCLUDE, + br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr, + h_addr, MCAST_EXCLUDE, true); break; } } -static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, +static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, @@ -441,8 +444,8 @@ static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, if (!set_h) goto fail_set_entry; - mod_timer(&set_h->timer, jiffies + br_multicast_gmi(br)); - mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&set_h->timer, jiffies + br_multicast_gmi(brmctx)); + mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(brmctx)); return; @@ -499,7 +502,8 @@ static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, } /* create new set entries from reports */ -static void __eht_create_set_entries(struct net_bridge_port_group *pg, +static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -512,8 +516,8 @@ static void __eht_create_set_entries(struct net_bridge_port_group *pg, memset(&eht_src_addr, 0, sizeof(eht_src_addr)); for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); - br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - filter_mode, + br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr, + h_addr, filter_mode, false); } } @@ -549,7 +553,8 @@ static bool __eht_del_set_entries(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, +static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -559,8 +564,8 @@ static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, - MCAST_INCLUDE); + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, + addr_size, MCAST_INCLUDE); break; case MCAST_EXCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, @@ -571,7 +576,8 @@ static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_eht_block(struct net_bridge_port_group *pg, +static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -585,7 +591,7 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, addr_size); break; case MCAST_EXCLUDE: - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE); break; } @@ -594,7 +600,8 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, } /* flush_entries is true when changing mode */ -static bool __eht_inc_exc(struct net_bridge_port_group *pg, +static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -612,11 +619,10 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, /* if we're changing mode del host and its entries */ if (flush_entries) br_multicast_del_eht_host(pg, h_addr); - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, filter_mode); /* we can be missing sets only if we've deleted some entries */ if (flush_entries) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_eht_set *eht_set; struct net_bridge_group_src *src_ent; struct hlist_node *tmp; @@ -647,14 +653,15 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, &eht_src_addr); if (!eht_set) continue; - mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(br)); + mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx)); } } return changed; } -static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, +static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -663,14 +670,15 @@ static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, { bool changed; - changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE, to_report); - br_eht_convert_host_filter_mode(pg, h_addr, MCAST_INCLUDE); + br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE); return changed; } -static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, +static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -679,14 +687,15 @@ static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, { bool changed; - changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE, to_report); - br_eht_convert_host_filter_mode(pg, h_addr, MCAST_EXCLUDE); + br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE); return changed; } -static bool __eht_ip4_handle(struct net_bridge_port_group *pg, +static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -696,24 +705,25 @@ static bool __eht_ip4_handle(struct net_bridge_port_group *pg, switch (grec_type) { case IGMPV3_ALLOW_NEW_SOURCES: - br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, sizeof(__be32)); + br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, + sizeof(__be32)); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; case IGMPV3_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; } @@ -722,7 +732,8 @@ static bool __eht_ip4_handle(struct net_bridge_port_group *pg, } #if IS_ENABLED(CONFIG_IPV6) -static bool __eht_ip6_handle(struct net_bridge_port_group *pg, +static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -732,18 +743,18 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, switch (grec_type) { case MLD2_ALLOW_NEW_SOURCES: - br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, + br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; @@ -751,7 +762,7 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, to_report = true; fallthrough; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; @@ -762,7 +773,8 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, #endif /* true means an entry was deleted */ -bool br_multicast_eht_handle(struct net_bridge_port_group *pg, +bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, @@ -779,12 +791,12 @@ bool br_multicast_eht_handle(struct net_bridge_port_group *pg, memset(&eht_host_addr, 0, sizeof(eht_host_addr)); memcpy(&eht_host_addr, h_addr, addr_size); if (addr_size == sizeof(__be32)) - changed = __eht_ip4_handle(pg, &eht_host_addr, srcs, nsrcs, - grec_type); + changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs, + nsrcs, grec_type); #if IS_ENABLED(CONFIG_IPV6) else - changed = __eht_ip6_handle(pg, &eht_host_addr, srcs, nsrcs, - grec_type); + changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs, + nsrcs, grec_type); #endif out: diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e4e6e991313e..6c58fc14d2cb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -287,7 +287,7 @@ static int br_port_fill_attrs(struct sk_buff *skb, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, - p->multicast_router) || + p->multicast_ctx.multicast_router) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, @@ -932,7 +932,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); - err = br_multicast_set_port_router(p, mcast_router); + err = br_multicast_set_port_router(&p->multicast_ctx, + mcast_router); if (err) return err; } @@ -1286,7 +1287,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_ROUTER]) { u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); - err = br_multicast_set_router(br, multicast_router); + err = br_multicast_set_router(&br->multicast_ctx, + multicast_router); if (err) return err; } @@ -1309,7 +1311,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_QUERIER]) { u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); - err = br_multicast_set_querier(br, mcast_querier); + err = br_multicast_set_querier(&br->multicast_ctx, + mcast_querier); if (err) return err; } @@ -1324,49 +1327,49 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); - br->multicast_last_member_count = val; + br->multicast_ctx.multicast_last_member_count = val; } if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); - br->multicast_startup_query_count = val; + br->multicast_ctx.multicast_startup_query_count = val; } if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); - br->multicast_last_member_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); - br->multicast_membership_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); - br->multicast_querier_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); - br->multicast_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); - br->multicast_query_response_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); - br->multicast_startup_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STATS_ENABLED]) { @@ -1380,7 +1383,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 igmp_version; igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); - err = br_multicast_set_igmp_version(br, igmp_version); + err = br_multicast_set_igmp_version(&br->multicast_ctx, + igmp_version); if (err) return err; } @@ -1390,7 +1394,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 mld_version; mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); - err = br_multicast_set_mld_version(br, mld_version); + err = br_multicast_set_mld_version(&br->multicast_ctx, + mld_version); if (err) return err; } @@ -1497,6 +1502,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ + br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1566,50 +1572,53 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, + br->multicast_ctx.multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, - br_opt_get(br, BROPT_MULTICAST_QUERIER)) || + br->multicast_ctx.multicast_querier) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, - br->multicast_last_member_count) || + br->multicast_ctx.multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, - br->multicast_startup_query_count) || + br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, - br->multicast_igmp_version)) + br->multicast_ctx.multicast_igmp_version) || + br_multicast_dump_querier_state(skb, &br->multicast_ctx, + IFLA_BR_MCAST_QUERIER_STATE)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, - br->multicast_mld_version)) + br->multicast_ctx.multicast_mld_version)) return -EMSGSIZE; #endif - clockval = jiffies_to_clock_t(br->multicast_last_member_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_membership_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_querier_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_query_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_query_response_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; @@ -1644,7 +1653,6 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) p = br_port_get_rtnl(dev); if (!p) return 0; - br = p->br; vg = nbp_vlan_group(p); break; default: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e013d33f1c7c..b4cef3a97f12 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -29,6 +29,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 +#define BR_HWDOM_MAX BITS_PER_LONG + #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ @@ -79,7 +81,8 @@ struct bridge_mcast_other_query { /* selected querier */ struct bridge_mcast_querier { struct br_ip addr; - struct net_bridge_port __rcu *port; + int port_ifidx; + seqcount_t seq; }; /* IGMP/MLD statistics */ @@ -89,6 +92,60 @@ struct bridge_mcast_stats { }; #endif +/* net_bridge_mcast_port must be always defined due to forwarding stubs */ +struct net_bridge_mcast_port { +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct net_bridge_port *port; + struct net_bridge_vlan *vlan; + + struct bridge_mcast_own_query ip4_own_query; + struct timer_list ip4_mc_router_timer; + struct hlist_node ip4_rlist; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_own_query ip6_own_query; + struct timer_list ip6_mc_router_timer; + struct hlist_node ip6_rlist; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + unsigned char multicast_router; +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ +}; + +/* net_bridge_mcast must be always defined due to forwarding stubs */ +struct net_bridge_mcast { +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct net_bridge *br; + struct net_bridge_vlan *vlan; + + u32 multicast_last_member_count; + u32 multicast_startup_query_count; + + u8 multicast_querier; + u8 multicast_igmp_version; + u8 multicast_router; +#if IS_ENABLED(CONFIG_IPV6) + u8 multicast_mld_version; +#endif + unsigned long multicast_last_member_interval; + unsigned long multicast_membership_interval; + unsigned long multicast_querier_interval; + unsigned long multicast_query_interval; + unsigned long multicast_query_response_interval; + unsigned long multicast_startup_query_interval; + struct hlist_head ip4_mc_router_list; + struct timer_list ip4_mc_router_timer; + struct bridge_mcast_other_query ip4_other_query; + struct bridge_mcast_own_query ip4_own_query; + struct bridge_mcast_querier ip4_querier; +#if IS_ENABLED(CONFIG_IPV6) + struct hlist_head ip6_mc_router_list; + struct timer_list ip6_mc_router_timer; + struct bridge_mcast_other_query ip6_other_query; + struct bridge_mcast_own_query ip6_own_query; + struct bridge_mcast_querier ip6_querier; +#endif /* IS_ENABLED(CONFIG_IPV6) */ +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ +}; + struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst __rcu *tunnel_dst; @@ -98,6 +155,8 @@ struct br_tunnel_info { enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), + BR_VLFLAG_MCAST_ENABLED = BIT(2), + BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3), }; /** @@ -114,6 +173,9 @@ enum { * @refcnt: if MASTER flag set, this is bumped for each port referencing it * @brvlan: if MASTER flag unset, this points to the global per-VLAN context * for this VLAN entry + * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context + * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast + * context * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * @@ -141,6 +203,11 @@ struct net_bridge_vlan { struct br_tunnel_info tinfo; + union { + struct net_bridge_mcast br_mcast_ctx; + struct net_bridge_mcast_port port_mcast_ctx; + }; + struct list_head vlist; struct rcu_head rcu; @@ -305,18 +372,14 @@ struct net_bridge_port { struct kobject kobj; struct rcu_head rcu; + struct net_bridge_mcast_port multicast_ctx; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - struct bridge_mcast_own_query ip4_own_query; -#if IS_ENABLED(CONFIG_IPV6) - struct bridge_mcast_own_query ip6_own_query; -#endif /* IS_ENABLED(CONFIG_IPV6) */ + struct bridge_mcast_stats __percpu *mcast_stats; + u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; - unsigned char multicast_router; - struct bridge_mcast_stats __percpu *mcast_stats; - struct timer_list multicast_router_timer; struct hlist_head mglist; - struct hlist_node rlist; #endif #ifdef CONFIG_SYSFS @@ -327,7 +390,12 @@ struct net_bridge_port { struct netpoll *np; #endif #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Identifier used to group ports that share the same switchdev + * hardware domain. + */ + int hwdom; + int offload_count; + struct netdev_phys_item_id ppid; #endif u16 group_fwd_mask; u16 backup_redirected_cnt; @@ -365,7 +433,6 @@ enum net_bridge_opts { BROPT_NF_CALL_ARPTABLES, BROPT_GROUP_ADDR_SET, BROPT_MULTICAST_ENABLED, - BROPT_MULTICAST_QUERIER, BROPT_MULTICAST_QUERY_USE_IFADDR, BROPT_MULTICAST_STATS_ENABLED, BROPT_HAS_IPV6_ADDR, @@ -374,6 +441,7 @@ enum net_bridge_opts { BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, + BROPT_MCAST_VLAN_SNOOPING_ENABLED, }; struct net_bridge { @@ -424,43 +492,21 @@ struct net_bridge { BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; + struct net_bridge_mcast multicast_ctx; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct bridge_mcast_stats __percpu *mcast_stats; u32 hash_max; - u32 multicast_last_member_count; - u32 multicast_startup_query_count; - - u8 multicast_igmp_version; - u8 multicast_router; -#if IS_ENABLED(CONFIG_IPV6) - u8 multicast_mld_version; -#endif spinlock_t multicast_lock; - unsigned long multicast_last_member_interval; - unsigned long multicast_membership_interval; - unsigned long multicast_querier_interval; - unsigned long multicast_query_interval; - unsigned long multicast_query_response_interval; - unsigned long multicast_startup_query_interval; struct rhashtable mdb_hash_tbl; struct rhashtable sg_port_tbl; struct hlist_head mcast_gc_list; struct hlist_head mdb_list; - struct hlist_head router_list; - struct timer_list multicast_router_timer; - struct bridge_mcast_other_query ip4_other_query; - struct bridge_mcast_own_query ip4_own_query; - struct bridge_mcast_querier ip4_querier; - struct bridge_mcast_stats __percpu *mcast_stats; -#if IS_ENABLED(CONFIG_IPV6) - struct bridge_mcast_other_query ip6_other_query; - struct bridge_mcast_own_query ip6_own_query; - struct bridge_mcast_querier ip6_querier; -#endif /* IS_ENABLED(CONFIG_IPV6) */ struct work_struct mcast_gc_work; #endif @@ -472,7 +518,12 @@ struct net_bridge { u32 auto_cnt; #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Counter used to make sure that hardware domains get unique + * identifiers in case a bridge spans multiple switchdev instances. + */ + int last_hwdom; + /* Bit mask of hardware domain numbers in use */ + unsigned long busy_hwdoms; #endif struct hlist_head fdb_list; @@ -502,7 +553,20 @@ struct br_input_skb_cb { #endif #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Set if TX data plane offloading is used towards at least one + * hardware domain. + */ + u8 tx_fwd_offload:1; + /* The switchdev hardware domain from which this packet was received. + * If skb->offload_fwd_mark was set, then this packet was already + * forwarded by hardware to the other ports in the source hardware + * domain, otherwise it wasn't. + */ + int src_hwdom; + /* Bit mask of hardware domains towards this packet has already been + * transmitted using the TX data plane offload. + */ + unsigned long fwd_hwdoms; #endif }; @@ -612,6 +676,20 @@ static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, return true; } +static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v) +{ + u8 mcast_router = MDB_RTR_TYPE_DISABLED; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (!br_vlan_is_master(v)) + mcast_router = v->port_mcast_ctx.multicast_router; + else + mcast_router = v->br_mcast_ctx.multicast_router; +#endif + + return mcast_router; +} + static inline int br_afspec_cmd_to_rtm(int cmd) { switch (cmd) { @@ -714,6 +792,8 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, bool swdev_notify); void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded); +int br_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, + struct notifier_block *nb); /* br_forward.c */ enum br_pkt_type { @@ -786,15 +866,18 @@ br_port_get_check_rtnl(const struct net_device *dev) } /* br_ioctl.c */ -int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, - void __user *arg); +int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, + void __user *data, int cmd); +int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid); -struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid); int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); @@ -806,17 +889,22 @@ void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); -void br_multicast_flood(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, bool local_rcv, bool local_orig); -int br_multicast_set_router(struct net_bridge *br, unsigned long val); -int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, + bool local_rcv, bool local_orig); +int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val); +int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, + unsigned long val); +int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); -int br_multicast_set_querier(struct net_bridge *br, unsigned long val); +int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); -int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); +int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, + unsigned long val); #if IS_ENABLED(CONFIG_IPV6) -int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); +int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, + unsigned long val); #endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); @@ -831,12 +919,13 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp); -void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, +void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); void br_multicast_uninit_stats(struct net_bridge *br); @@ -845,7 +934,8 @@ void br_multicast_get_stats(const struct net_bridge *br, struct br_mcast_stats *dest); void br_mdb_init(void); void br_mdb_uninit(void); -void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_host_join(const struct net_bridge_mcast *brmctx, + struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode); @@ -855,6 +945,29 @@ struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); +void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx); +void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx); +void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx); +void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); + +int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack); +int br_rports_fill_info(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx); +int br_multicast_dump_querier_state(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx, + int nest_attr); +size_t br_multicast_querier_state_size(void); +size_t br_rports_size(const struct net_bridge_mcast *brmctx); static inline bool br_group_is_l2(const struct br_ip *group) { @@ -864,22 +977,82 @@ static inline bool br_group_is_l2(const struct br_ip *group) #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) -static inline bool br_multicast_is_router(struct net_bridge *br) +static inline struct hlist_node * +br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx, + struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list)); +#endif + return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list)); +} + +static inline struct net_bridge_port * +br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { - return br->multicast_router == 2 || - (br->multicast_router == 1 && - timer_pending(&br->multicast_router_timer)); + struct net_bridge_mcast_port *mctx; + +#if IS_ENABLED(CONFIG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) + mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, + ip6_rlist); + else +#endif + mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, + ip4_rlist); + + if (mctx) + return mctx->port; + else + return NULL; +} + +static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx) +{ + return timer_pending(&brmctx->ip4_mc_router_timer); +} + +static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + return timer_pending(&brmctx->ip6_mc_router_timer); +#else + return false; +#endif +} + +static inline bool +br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) +{ + switch (brmctx->multicast_router) { + case MDB_RTR_TYPE_PERM: + return true; + case MDB_RTR_TYPE_TEMP_QUERY: + if (skb) { + if (skb->protocol == htons(ETH_P_IP)) + return br_ip4_multicast_is_router(brmctx); + else if (skb->protocol == htons(ETH_P_IPV6)) + return br_ip6_multicast_is_router(brmctx); + } else { + return br_ip4_multicast_is_router(brmctx) || + br_ip6_multicast_is_router(brmctx); + } + fallthrough; + default: + return false; + } } static inline bool -__br_multicast_querier_exists(struct net_bridge *br, - struct bridge_mcast_other_query *querier, - const bool is_ipv6) +__br_multicast_querier_exists(struct net_bridge_mcast *brmctx, + struct bridge_mcast_other_query *querier, + const bool is_ipv6) { bool own_querier_enabled; - if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR)) + if (brmctx->multicast_querier) { + if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else own_querier_enabled = true; @@ -891,18 +1064,18 @@ __br_multicast_querier_exists(struct net_bridge *br, (own_querier_enabled || timer_pending(&querier->timer)); } -static inline bool br_multicast_querier_exists(struct net_bridge *br, +static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): - return __br_multicast_querier_exists(br, - &br->ip4_other_query, false); + return __br_multicast_querier_exists(brmctx, + &brmctx->ip4_other_query, false); #if IS_ENABLED(CONFIG_IPV6) case (htons(ETH_P_IPV6)): - return __br_multicast_querier_exists(br, - &br->ip6_other_query, true); + return __br_multicast_querier_exists(brmctx, + &brmctx->ip6_other_query, true); #endif default: return !!mdb && br_group_is_l2(&mdb->addr); @@ -923,15 +1096,16 @@ static inline bool br_multicast_is_star_g(const struct br_ip *ip) } } -static inline bool br_multicast_should_handle_mode(const struct net_bridge *br, - __be16 proto) +static inline bool +br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx, + __be16 proto) { switch (proto) { case htons(ETH_P_IP): - return !!(br->multicast_igmp_version == 3); + return !!(brmctx->multicast_igmp_version == 3); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return !!(br->multicast_mld_version == 2); + return !!(brmctx->multicast_mld_version == 2); #endif default: return false; @@ -943,28 +1117,145 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) return BR_INPUT_SKB_CB(skb)->igmp; } -static inline unsigned long br_multicast_lmqt(const struct net_bridge *br) +static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx) { - return br->multicast_last_member_interval * - br->multicast_last_member_count; + return brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count; } -static inline unsigned long br_multicast_gmi(const struct net_bridge *br) +static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) { /* use the RFC default of 2 for QRV */ - return 2 * br->multicast_query_interval + - br->multicast_query_response_interval; + return 2 * brmctx->multicast_query_interval + + brmctx->multicast_query_response_interval; } + +static inline bool +br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx) +{ + return !!brmctx->vlan; +} + +static inline bool +br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx) +{ + return !!pmctx->vlan; +} + +static inline struct net_bridge_mcast * +br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx) +{ + if (!br_multicast_port_ctx_is_vlan(pmctx)) + return &pmctx->port->br->multicast_ctx; + else + return &pmctx->vlan->brvlan->br_mcast_ctx; +} + +static inline bool +br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx) +{ + return br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + br_multicast_ctx_is_vlan(brmctx) && + !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED); +} + +static inline bool +br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx) +{ + return br_multicast_ctx_is_vlan(brmctx) && + !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); +} + +static inline bool +br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx) +{ + return br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); +} + +static inline bool +br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx) +{ + return pmctx->port->state == BR_STATE_DISABLED || + (br_multicast_port_ctx_is_vlan(pmctx) && + (br_multicast_port_ctx_vlan_disabled(pmctx) || + pmctx->vlan->state == BR_STATE_DISABLED)); +} + +static inline bool +br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) +{ + return br_multicast_port_ctx_state_disabled(pmctx) || + pmctx->port->state == BR_STATE_BLOCKING || + (br_multicast_port_ctx_is_vlan(pmctx) && + pmctx->vlan->state == BR_STATE_BLOCKING); +} + +static inline bool +br_rports_have_mc_router(const struct net_bridge_mcast *brmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + return !hlist_empty(&brmctx->ip4_mc_router_list) || + !hlist_empty(&brmctx->ip6_mc_router_list); #else -static inline int br_multicast_rcv(struct net_bridge *br, - struct net_bridge_port *port, + return !hlist_empty(&brmctx->ip4_mc_router_list); +#endif +} + +static inline bool +br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, + const struct net_bridge_mcast *brmctx2) +{ + return brmctx1->multicast_igmp_version == + brmctx2->multicast_igmp_version && + brmctx1->multicast_last_member_count == + brmctx2->multicast_last_member_count && + brmctx1->multicast_startup_query_count == + brmctx2->multicast_startup_query_count && + brmctx1->multicast_last_member_interval == + brmctx2->multicast_last_member_interval && + brmctx1->multicast_membership_interval == + brmctx2->multicast_membership_interval && + brmctx1->multicast_querier_interval == + brmctx2->multicast_querier_interval && + brmctx1->multicast_query_interval == + brmctx2->multicast_query_interval && + brmctx1->multicast_query_response_interval == + brmctx2->multicast_query_response_interval && + brmctx1->multicast_startup_query_interval == + brmctx2->multicast_startup_query_interval && + brmctx1->multicast_querier == brmctx2->multicast_querier && + brmctx1->multicast_router == brmctx2->multicast_router && + !br_rports_have_mc_router(brmctx1) && + !br_rports_have_mc_router(brmctx2) && +#if IS_ENABLED(CONFIG_IPV6) + brmctx1->multicast_mld_version == + brmctx2->multicast_mld_version && +#endif + true; +} + +static inline bool +br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) +{ + bool vlan_snooping_enabled; + + vlan_snooping_enabled = !!br_opt_get(brmctx->br, + BROPT_MCAST_VLAN_SNOOPING_ENABLED); + + return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); +} +#else +static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { return 0; } -static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { return NULL; @@ -1013,16 +1304,18 @@ static inline void br_multicast_dev_del(struct net_bridge *br) static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { } -static inline bool br_multicast_is_router(struct net_bridge *br) +static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, + struct sk_buff *skb) { return false; } -static inline bool br_multicast_querier_exists(struct net_bridge *br, +static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { @@ -1066,13 +1359,67 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return 0; } + +static inline void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx) +{ +} + +static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) +{ +} + +static inline void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx) +{ +} + +static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) +{ +} + +static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, + bool on) +{ +} + +static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, + bool on, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, + bool on) +{ + return false; +} + +static inline int br_mdb_replay(struct net_device *br_dev, + struct net_device *dev, const void *ctx, + bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline bool +br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, + const struct net_bridge_mcast *brmctx2) +{ + return true; +} #endif /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state); + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -1116,6 +1463,9 @@ void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd); +int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack); bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); @@ -1184,8 +1534,11 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) + { + *vlan = NULL; return true; } @@ -1358,6 +1711,14 @@ static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, { return true; } + +static inline int br_vlan_replay(struct net_device *br_dev, + struct net_device *dev, const void *ctx, + bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif /* br_vlan_options.c */ @@ -1372,6 +1733,14 @@ int br_vlan_process_options(const struct net_bridge *br, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack); +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end); +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) @@ -1593,7 +1962,25 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; } /* br_switchdev.c */ #ifdef CONFIG_NET_SWITCHDEV -int nbp_switchdev_mark_set(struct net_bridge_port *p); +int br_switchdev_port_offload(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack); + +void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb); + +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); + +void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); + +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb); +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb); void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb); bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, @@ -1602,20 +1989,55 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack); -void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, - int type); +void br_switchdev_fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); +void br_switchdev_init(struct net_bridge *br); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { skb->offload_fwd_mark = 0; } #else -static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +static inline int +br_switchdev_port_offload(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline void +br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ +} + +static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) +{ + return false; +} + +static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) { - return 0; } static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, @@ -1650,13 +2072,19 @@ static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) } static inline void -br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) +br_switchdev_fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type) { } static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { } + +static inline void br_switchdev_init(struct net_bridge *br) +{ +} + #endif /* CONFIG_NET_SWITCHDEV */ /* br_arp_nd_proxy.c */ diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h index f89049f4892c..adf82a05515a 100644 --- a/net/bridge/br_private_mcast_eht.h +++ b/net/bridge/br_private_mcast_eht.h @@ -51,7 +51,8 @@ struct net_bridge_group_eht_set { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg); -bool br_multicast_eht_handle(struct net_bridge_port_group *pg, +bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 9559aa2750fb..bda8e1896712 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -6,6 +6,8 @@ #include "br_private.h" #include <uapi/linux/mrp_bridge.h> +#define MRP_OPT_PADDING 0x2 + struct br_mrp { /* list of mrp instances */ struct hlist_node list; @@ -134,4 +136,13 @@ struct br_mrp_in_test_hdr { __be32 timestamp; } __attribute__((__packed__)); +struct br_mrp_oui_hdr { + __u8 oui[MRP_OUI_LENGTH]; +}; + +struct br_mrp_sub_option1_hdr { + __u8 type; + __u8 data[MRP_MANUFACTURE_DATA_LENGTH]; +}; + #endif /* _BR_PRIVATE_MRP_H */ diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h index c54cc26211d7..2b053289f016 100644 --- a/net/bridge/br_private_tunnel.h +++ b/net/bridge/br_private_tunnel.h @@ -38,9 +38,9 @@ int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port); void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan); -int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, - struct net_bridge_port *p, - struct net_bridge_vlan_group *vg); +void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg); int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan); bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 3dafb6143cff..1d80f34a139c 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -639,9 +639,9 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) return 0; } -clock_t br_get_ageing_time(struct net_device *br_dev) +clock_t br_get_ageing_time(const struct net_device *br_dev) { - struct net_bridge *br; + const struct net_bridge *br; if (!netif_is_bridge_master(br_dev)) return 0; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index a5e601e41cb9..6bf518d78f02 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -8,50 +8,65 @@ #include "br_private.h" -static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) -{ - struct net_bridge_port *p; +static struct static_key_false br_switchdev_tx_fwd_offload; - /* dev is yet to be added to the port list. */ - list_for_each_entry(p, &br->port_list, list) { - if (netdev_port_same_parent_id(dev, p->dev)) - return p->offload_fwd_mark; - } +static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; - return ++br->offload_fwd_mark; + return (p->flags & BR_TX_FWD_OFFLOAD) && + (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } -int nbp_switchdev_mark_set(struct net_bridge_port *p) +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { - struct netdev_phys_item_id ppid = { }; - int err; + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; - ASSERT_RTNL(); + return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; +} - err = dev_get_port_parent_id(p->dev, &ppid, true); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } +void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) +{ + skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); +} - p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); +/* Mark the frame for TX forwarding offload if this egress port supports it */ +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; +} - return 0; +/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms + * that the skb has been already forwarded to, to avoid further cloning to + * other ports in the same hwdom by making nbp_switchdev_allowed_egress() + * return false. + */ +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); } void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { - if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) - BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; + if (p->hwdom) + BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom; } bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { - return !skb->offload_fwd_mark || - BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; + struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); + + return !test_bit(p->hwdom, &cb->fwd_hwdoms) && + (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ @@ -108,8 +123,10 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, } void -br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) +br_switchdev_fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type) { + const struct net_bridge_port *dst = READ_ONCE(fdb->dst); struct switchdev_notifier_fdb_info info = { .addr = fdb->key.addr.addr, .vid = fdb->key.vlan_id, @@ -117,18 +134,16 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) .is_local = test_bit(BR_FDB_LOCAL, &fdb->flags), .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), }; - - if (!fdb->dst) - return; + struct net_device *dev = (!dst || info.is_local) ? br->dev : dst->dev; switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, - fdb->dst->dev, &info.info, NULL); + dev, &info.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, - fdb->dst->dev, &info.info, NULL); + dev, &info.info, NULL); break; } } @@ -156,3 +171,182 @@ int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) return switchdev_port_obj_del(dev, &v.obj); } + +static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining) +{ + struct net_bridge *br = joining->br; + struct net_bridge_port *p; + int hwdom; + + /* joining is yet to be added to the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) { + joining->hwdom = p->hwdom; + return 0; + } + } + + hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1); + if (hwdom >= BR_HWDOM_MAX) + return -EBUSY; + + set_bit(hwdom, &br->busy_hwdoms); + joining->hwdom = hwdom; + return 0; +} + +static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) +{ + struct net_bridge *br = leaving->br; + struct net_bridge_port *p; + + /* leaving is no longer in the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (p->hwdom == leaving->hwdom) + return; + } + + clear_bit(leaving->hwdom, &br->busy_hwdoms); +} + +static int nbp_switchdev_add(struct net_bridge_port *p, + struct netdev_phys_item_id ppid, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + int err; + + if (p->offload_count) { + /* Prevent unsupported configurations such as a bridge port + * which is a bonding interface, and the member ports are from + * different hardware switches. + */ + if (!netdev_phys_item_id_same(&p->ppid, &ppid)) { + NL_SET_ERR_MSG_MOD(extack, + "Same bridge port cannot be offloaded by two physical switches"); + return -EBUSY; + } + + /* Tolerate drivers that call switchdev_bridge_port_offload() + * more than once for the same bridge port, such as when the + * bridge port is an offloaded bonding/team interface. + */ + p->offload_count++; + + return 0; + } + + p->ppid = ppid; + p->offload_count = 1; + + err = nbp_switchdev_hwdom_set(p); + if (err) + return err; + + if (tx_fwd_offload) { + p->flags |= BR_TX_FWD_OFFLOAD; + static_branch_inc(&br_switchdev_tx_fwd_offload); + } + + return 0; +} + +static void nbp_switchdev_del(struct net_bridge_port *p) +{ + if (WARN_ON(!p->offload_count)) + return; + + p->offload_count--; + + if (p->offload_count) + return; + + if (p->hwdom) + nbp_switchdev_hwdom_put(p); + + if (p->flags & BR_TX_FWD_OFFLOAD) { + p->flags &= ~BR_TX_FWD_OFFLOAD; + static_branch_dec(&br_switchdev_tx_fwd_offload); + } +} + +static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct net_device *br_dev = p->br->dev; + struct net_device *dev = p->dev; + int err; + + err = br_vlan_replay(br_dev, dev, ctx, true, blocking_nb, extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_fdb_replay(br_dev, ctx, true, atomic_nb); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + struct net_device *br_dev = p->br->dev; + struct net_device *dev = p->dev; + + br_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + + br_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + + br_fdb_replay(br_dev, ctx, false, atomic_nb); +} + +/* Let the bridge know that this port is offloaded, so that it can assign a + * switchdev hardware domain to it. + */ +int br_switchdev_port_offload(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + struct netdev_phys_item_id ppid; + int err; + + err = dev_get_port_parent_id(dev, &ppid, false); + if (err) + return err; + + err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack); + if (err) + return err; + + err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); + if (err) + goto out_switchdev_del; + + return 0; + +out_switchdev_del: + nbp_switchdev_del(p); + + return err; +} + +void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb); + + nbp_switchdev_del(p); +} diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 381467b691d5..d9a89ddd0331 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -384,13 +384,13 @@ static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_router); + return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_router(br, val); + return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, @@ -447,13 +447,13 @@ static ssize_t multicast_querier_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER)); + return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_querier(br, val); + return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, @@ -514,13 +514,13 @@ static ssize_t multicast_igmp_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_igmp_version); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_igmp_version(br, val); + return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, @@ -536,13 +536,13 @@ static ssize_t multicast_last_member_count_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_last_member_count); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_last_member_count = val; + br->multicast_ctx.multicast_last_member_count = val; return 0; } @@ -558,13 +558,13 @@ static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_startup_query_count); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_startup_query_count = val; + br->multicast_ctx.multicast_startup_query_count = val; return 0; } @@ -581,13 +581,13 @@ static ssize_t multicast_last_member_interval_show( { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_last_member_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_last_member_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } @@ -604,13 +604,13 @@ static ssize_t multicast_membership_interval_show( { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_membership_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_membership_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } @@ -628,13 +628,13 @@ static ssize_t multicast_querier_interval_show(struct device *d, { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_querier_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_querier_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } @@ -652,13 +652,13 @@ static ssize_t multicast_query_interval_show(struct device *d, { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_query_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_interval = clock_t_to_jiffies(val); return 0; } @@ -676,13 +676,13 @@ static ssize_t multicast_query_response_interval_show( struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", - jiffies_to_clock_t(br->multicast_query_response_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_query_response_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } @@ -700,13 +700,13 @@ static ssize_t multicast_startup_query_interval_show( struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", - jiffies_to_clock_t(br->multicast_startup_query_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_startup_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); return 0; } @@ -751,13 +751,13 @@ static ssize_t multicast_mld_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_mld_version); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_mld_version(br, val); + return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 72e92376eef1..07fa76080512 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -244,13 +244,13 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->multicast_router); + return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, unsigned long v) { - return br_multicast_set_port_router(p, v); + return br_multicast_set_port_router(&p->multicast_ctx, v); } static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index da3256a3eed0..19f65ab91a02 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -113,9 +113,7 @@ static void __vlan_add_list(struct net_bridge_vlan *v) headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); - if (v->vid < vent->vid) - continue; - else + if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); @@ -192,6 +190,8 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); + br_multicast_toggle_one_vlan(masterv, false); + br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } @@ -282,10 +282,13 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } else { v->stats = masterv->stats; } + br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err && err != -EOPNOTSUPP) goto out; + br_multicast_ctx_init(br, v, &v->br_mcast_ctx); + v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ @@ -308,6 +311,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, __vlan_add_list(v); __vlan_add_flags(v, flags); + br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); @@ -376,6 +380,8 @@ static int __vlan_del(struct net_bridge_vlan *v) br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); + br_multicast_toggle_one_vlan(v, false); + br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } @@ -459,7 +465,15 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, u64_stats_update_end(&stats->syncp); } - if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) + /* If the skb will be sent using forwarding offload, the assumption is + * that the switchdev will inject the packet into hardware together + * with the bridge VLAN, so that it can be forwarded according to that + * VLAN. The switchdev should deal with popping the VLAN header in + * hardware on each egress port as appropriate. So only strip the VLAN + * header if forwarding offload is not being used. + */ + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && + !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && @@ -475,7 +489,8 @@ out: static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, - u8 *state) + u8 *state, + struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; @@ -540,8 +555,9 @@ static bool __allowed_ingress(const struct net_bridge *br, */ skb->vlan_tci |= pvid; - /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + /* if snooping and stats are disabled we can avoid the lookup */ + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); return br_vlan_state_allowed(*state, true); @@ -568,6 +584,8 @@ static bool __allowed_ingress(const struct net_bridge *br, u64_stats_update_end(&stats->syncp); } + *vlan = v; + return true; drop: @@ -577,17 +595,19 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ + *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } - return __allowed_ingress(br, vg, skb, vid, state); + return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ @@ -674,6 +694,7 @@ static int br_vlan_add_existing(struct net_bridge *br, vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; *changed = true; + br_multicast_toggle_one_vlan(vlan, true); } if (__vlan_add_flags(vlan, flags)) @@ -820,14 +841,21 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; + br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); + err = switchdev_port_attr_set(br->dev, &attr, extack); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; + } - br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); + if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); + br_multicast_toggle_vlan_snooping(br, false, NULL); + } return 0; } @@ -1422,6 +1450,33 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, } EXPORT_SYMBOL_GPL(br_vlan_get_info); +int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + + p = br_port_get_check_rcu(dev); + if (p) + vg = nbp_vlan_group_rcu(p); + else if (netif_is_bridge_master(dev)) + vg = br_vlan_group_rcu(netdev_priv(dev)); + else + return -EINVAL; + + v = br_vlan_find(vg, vid); + if (!v) + return -ENOENT; + + p_vinfo->vid = vid; + p_vinfo->flags = v->flags; + if (vid == br_get_pvid(vg)) + p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); + static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) { return is_vlan_dev(dev) && @@ -1809,33 +1864,40 @@ out_kfree: static int br_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_vlan *vlan, + const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, + .ctx = ctx, }, .obj = &vlan->obj, }; int err; - err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - struct notifier_block *nb, struct netlink_ext_ack *extack) + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; struct net_bridge *br; + unsigned long action; int err = 0; u16 pvid; ASSERT_RTNL(); + if (!nb) + return 0; + if (!netif_is_bridge_master(br_dev)) return -EINVAL; @@ -1857,6 +1919,11 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, if (!vg) return 0; + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { @@ -1870,14 +1937,13 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, if (!br_vlan_should_use(v)) continue; - err = br_vlan_replay_one(nb, dev, &vlan, extack); + err = br_vlan_replay_one(nb, dev, &vlan, ctx, action, extack); if (err) return err; } return err; } -EXPORT_SYMBOL_GPL(br_vlan_replay); /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, @@ -1894,6 +1960,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; @@ -1912,6 +1979,10 @@ static int br_vlan_dump_dev(const struct net_device *dev, vg = br_vlan_group_rcu(br); p = NULL; } else { + /* global options are dumped only for bridge devices */ + if (dump_global) + return 0; + p = br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; @@ -1934,7 +2005,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { - if (!br_vlan_should_use(v)) + if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; @@ -1947,8 +2018,21 @@ static int br_vlan_dump_dev(const struct net_device *dev, continue; } - if (dump_stats || v->vid == pvid || - !br_vlan_can_enter_range(v, range_end)) { + if (dump_global) { + if (br_vlan_global_opts_can_enter_range(v, range_end)) + goto update_end; + if (!br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, + range_start)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } else if (dump_stats || v->vid == pvid || + !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, @@ -1962,6 +2046,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, range_start = v; } +update_end: range_end = v; } @@ -1970,11 +2055,18 @@ static int br_vlan_dump_dev(const struct net_device *dev, * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ - if (!err && range_start && - !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, - range_start, br_vlan_flags(range_start, pvid), - dump_stats)) - err = -EMSGSIZE; + if (!err && range_start) { + if (dump_global && + !br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, range_start)) + err = -EMSGSIZE; + else if (!dump_global && + !br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, range_start, + br_vlan_flags(range_start, pvid), + dump_stats)) + err = -EMSGSIZE; + } cb->args[1] = err ? idx : 0; @@ -2044,6 +2136,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, + [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, @@ -2178,12 +2271,22 @@ static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { - if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + switch (nla_type(attr)) { + case BRIDGE_VLANDB_ENTRY: + err = br_vlan_rtm_process_one(dev, attr, + nlh->nlmsg_type, + extack); + break; + case BRIDGE_VLANDB_GLOBAL_OPTIONS: + err = br_vlan_rtm_process_global_options(dev, attr, + nlh->nlmsg_type, + extack); + break; + default: continue; + } vlans++; - err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, - extack); if (err) break; } diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index b4add9ea8964..8ffd4ed2563c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -40,22 +40,38 @@ static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { + u8 range_mc_rtr = br_vlan_multicast_router(range_end); + u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); + return v_curr->state == range_end->state && - __vlan_tun_can_enter_range(v_curr, range_end); + __vlan_tun_can_enter_range(v_curr, range_end) && + curr_mc_rtr == range_mc_rtr; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { - return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, - br_vlan_get_state(v)) && - __vlan_tun_put(skb, v); + if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || + !__vlan_tun_put(skb, v)) + return false; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, + br_vlan_multicast_router(v))) + return false; +#endif + + return true; } size_t br_vlan_opts_nl_size(void) { return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */ - + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_TINFO_ID */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ +#endif + + 0; } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, @@ -181,6 +197,18 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, return err; } +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]); + err = br_multicast_set_vlan_router(v, val); + if (err) + return err; + *changed = true; + } +#endif + return 0; } @@ -258,3 +286,392 @@ int br_vlan_process_options(const struct net_bridge *br, return err; } + +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end) +{ + return v_curr->vid - r_end->vid == 1 && + ((v_curr->priv_flags ^ r_end->priv_flags) & + BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && + br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, + &r_end->br_mcast_ctx); +} + +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts) +{ + struct nlattr *nest2 __maybe_unused; + u64 clockval __maybe_unused; + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); + if (!nest) + return false; + + if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid)) + goto out_err; + + if (vid_range && vid < vid_range && + nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) + goto out_err; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, + v_opts->br_mcast_ctx.multicast_igmp_version) || + nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, + v_opts->br_mcast_ctx.multicast_last_member_count) || + nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, + v_opts->br_mcast_ctx.multicast_startup_query_count) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, + v_opts->br_mcast_ctx.multicast_querier) || + br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE)) + goto out_err; + + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + + if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) { + nest2 = nla_nest_start(skb, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS); + if (!nest2) + goto out_err; + + rcu_read_lock(); + if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) { + rcu_read_unlock(); + nla_nest_cancel(skb, nest2); + goto out_err; + } + rcu_read_unlock(); + + nla_nest_end(skb, nest2); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, + v_opts->br_mcast_ctx.multicast_mld_version)) + goto out_err; +#endif +#endif + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + +static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) +{ + return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */ + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ + + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */ + + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ +#endif + + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ +} + +static void br_vlan_global_opts_notify(const struct net_bridge *br, + u16 vid, u16 vid_range) +{ + struct net_bridge_vlan *v; + struct br_vlan_msg *bvm; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* right now notifications are done only with rtnl held */ + ASSERT_RTNL(); + + /* need to find the vlan due to flags/options */ + v = br_vlan_find(br_vlan_group(br), vid); + if (!v) + return; + + skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(v), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0); + if (!nlh) + goto out_err; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = AF_BRIDGE; + bvm->ifindex = br->dev->ifindex; + + if (!br_vlan_global_opts_fill(skb, vid, vid_range, v)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); + return; + +out_err: + rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err); + kfree_skb(skb); +} + +static int br_vlan_process_global_one_opts(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + int err __maybe_unused; + + *changed = false; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) { + u8 mc_snooping; + + mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]); + if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) { + u8 ver; + + ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]); + err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver); + if (err) + return err; + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) { + u32 cnt; + + cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]); + v->br_mcast_ctx.multicast_last_member_count = cnt; + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) { + u32 cnt; + + cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]); + v->br_mcast_ctx.multicast_startup_query_count = cnt; + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]); + v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]); + v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]); + v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]); + v->br_mcast_ctx.multicast_query_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]); + v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]); + v->br_mcast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]); + err = br_multicast_set_querier(&v->br_mcast_ctx, val); + if (err) + return err; + *changed = true; + } +#if IS_ENABLED(CONFIG_IPV6) + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { + u8 ver; + + ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]); + err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver); + if (err) + return err; + *changed = true; + } +#endif +#endif + + return 0; +} + +static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { + [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, +}; + +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; + struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1]; + struct net_bridge_vlan_group *vg; + u16 vid, vid_range = 0; + struct net_bridge *br; + int err = 0; + + if (cmd != RTM_NEWVLAN) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation"); + return -EINVAL; + } + if (!netif_is_bridge_master(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device"); + return -EINVAL; + } + br = netdev_priv(dev); + vg = br_vlan_group(br); + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr, + br_vlan_db_gpol, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_GOPTS_ID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id"); + return -EINVAL; + } + vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]); + if (!br_vlan_valid_id(vid, extack)) + return -EINVAL; + + if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) { + vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]); + if (!br_vlan_valid_id(vid_range, extack)) + return -EINVAL; + if (vid >= vid_range) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return -EINVAL; + } + } else { + vid_range = vid; + } + + for (; vid <= vid_range; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed, + extack); + if (err) + break; + + if (changed) { + /* vlan options changed, check for range */ + if (!curr_start) { + curr_start = v; + curr_end = v; + continue; + } + + if (!br_vlan_global_opts_can_enter_range(v, curr_end)) { + br_vlan_global_opts_notify(br, curr_start->vid, + curr_end->vid); + curr_start = v; + } + curr_end = v; + } else { + /* nothing changed and nothing to notify yet */ + if (!curr_start) + continue; + + br_vlan_global_opts_notify(br, curr_start->vid, + curr_end->vid); + curr_start = NULL; + curr_end = NULL; + } + } + if (curr_start) + br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); + + return err; +} diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 01017448ebde..6399a8a69d07 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -158,30 +158,28 @@ void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) rhashtable_destroy(&vg->tunnel_hash); } -int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, - struct net_bridge_port *p, - struct net_bridge_vlan_group *vg) +void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) { struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); struct net_bridge_vlan *vlan; if (!vg || !tinfo) - return 0; + return; /* if already tagged, ignore */ if (skb_vlan_tagged(skb)) - return 0; + return; /* lookup vid, given tunnel id */ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); if (!vlan) - return 0; + return; skb_dst_drop(skb); __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); - - return 0; } int br_handle_egress_vlan_tunnel(struct sk_buff *skb, diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 020b1487ee0c..a7af4eaff17d 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -98,7 +98,7 @@ static const struct nf_hook_ops ebt_ops_broute = { .priority = NF_BR_PRI_FIRST, }; -static int __net_init broute_net_init(struct net *net) +static int broute_table_init(struct net *net) { return ebt_register_table(net, &broute_table, &ebt_ops_broute); } @@ -114,19 +114,30 @@ static void __net_exit broute_net_exit(struct net *net) } static struct pernet_operations broute_net_ops = { - .init = broute_net_init, .exit = broute_net_exit, .pre_exit = broute_net_pre_exit, }; static int __init ebtable_broute_init(void) { - return register_pernet_subsys(&broute_net_ops); + int ret = ebt_register_template(&broute_table, broute_table_init); + + if (ret) + return ret; + + ret = register_pernet_subsys(&broute_net_ops); + if (ret) { + ebt_unregister_template(&broute_table); + return ret; + } + + return 0; } static void __exit ebtable_broute_fini(void) { unregister_pernet_subsys(&broute_net_ops); + ebt_unregister_template(&broute_table); } module_init(ebtable_broute_init); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 8ec0b3736803..c0b121df4a9a 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -86,7 +86,7 @@ static const struct nf_hook_ops ebt_ops_filter[] = { }, }; -static int __net_init frame_filter_net_init(struct net *net) +static int frame_filter_table_init(struct net *net) { return ebt_register_table(net, &frame_filter, ebt_ops_filter); } @@ -102,19 +102,30 @@ static void __net_exit frame_filter_net_exit(struct net *net) } static struct pernet_operations frame_filter_net_ops = { - .init = frame_filter_net_init, .exit = frame_filter_net_exit, .pre_exit = frame_filter_net_pre_exit, }; static int __init ebtable_filter_init(void) { - return register_pernet_subsys(&frame_filter_net_ops); + int ret = ebt_register_template(&frame_filter, frame_filter_table_init); + + if (ret) + return ret; + + ret = register_pernet_subsys(&frame_filter_net_ops); + if (ret) { + ebt_unregister_template(&frame_filter); + return ret; + } + + return 0; } static void __exit ebtable_filter_fini(void) { unregister_pernet_subsys(&frame_filter_net_ops); + ebt_unregister_template(&frame_filter); } module_init(ebtable_filter_init); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 7c8a1064a531..4078151c224f 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -85,7 +85,7 @@ static const struct nf_hook_ops ebt_ops_nat[] = { }, }; -static int __net_init frame_nat_net_init(struct net *net) +static int frame_nat_table_init(struct net *net) { return ebt_register_table(net, &frame_nat, ebt_ops_nat); } @@ -101,19 +101,30 @@ static void __net_exit frame_nat_net_exit(struct net *net) } static struct pernet_operations frame_nat_net_ops = { - .init = frame_nat_net_init, .exit = frame_nat_net_exit, .pre_exit = frame_nat_net_pre_exit, }; static int __init ebtable_nat_init(void) { - return register_pernet_subsys(&frame_nat_net_ops); + int ret = ebt_register_template(&frame_nat, frame_nat_table_init); + + if (ret) + return ret; + + ret = register_pernet_subsys(&frame_nat_net_ops); + if (ret) { + ebt_unregister_template(&frame_nat); + return ret; + } + + return ret; } static void __exit ebtable_nat_fini(void) { unregister_pernet_subsys(&frame_nat_net_ops); + ebt_unregister_template(&frame_nat); } module_init(ebtable_nat_init); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f022deb3721e..83d1798dfbb4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -44,7 +44,16 @@ struct ebt_pernet { struct list_head tables; }; +struct ebt_template { + struct list_head list; + char name[EBT_TABLE_MAXNAMELEN]; + struct module *owner; + /* called when table is needed in the given netns */ + int (*table_init)(struct net *net); +}; + static unsigned int ebt_pernet_id __read_mostly; +static LIST_HEAD(template_tables); static DEFINE_MUTEX(ebt_mutex); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -309,30 +318,57 @@ letscontinue: /* If it succeeds, returns element and locks mutex */ static inline void * -find_inlist_lock_noload(struct list_head *head, const char *name, int *error, +find_inlist_lock_noload(struct net *net, const char *name, int *error, struct mutex *mutex) { - struct { - struct list_head list; - char name[EBT_FUNCTION_MAXNAMELEN]; - } *e; + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + struct ebt_template *tmpl; + struct ebt_table *table; mutex_lock(mutex); - list_for_each_entry(e, head, list) { - if (strcmp(e->name, name) == 0) - return e; + list_for_each_entry(table, &ebt_net->tables, list) { + if (strcmp(table->name, name) == 0) + return table; } + + list_for_each_entry(tmpl, &template_tables, list) { + if (strcmp(name, tmpl->name) == 0) { + struct module *owner = tmpl->owner; + + if (!try_module_get(owner)) + goto out; + + mutex_unlock(mutex); + + *error = tmpl->table_init(net); + if (*error) { + module_put(owner); + return NULL; + } + + mutex_lock(mutex); + module_put(owner); + break; + } + } + + list_for_each_entry(table, &ebt_net->tables, list) { + if (strcmp(table->name, name) == 0) + return table; + } + +out: *error = -ENOENT; mutex_unlock(mutex); return NULL; } static void * -find_inlist_lock(struct list_head *head, const char *name, const char *prefix, +find_inlist_lock(struct net *net, const char *name, const char *prefix, int *error, struct mutex *mutex) { return try_then_request_module( - find_inlist_lock_noload(head, name, error, mutex), + find_inlist_lock_noload(net, name, error, mutex), "%s%s", prefix, name); } @@ -340,10 +376,7 @@ static inline struct ebt_table * find_table_lock(struct net *net, const char *name, int *error, struct mutex *mutex) { - struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); - - return find_inlist_lock(&ebt_net->tables, name, - "ebtable_", error, mutex); + return find_inlist_lock(net, name, "ebtable_", error, mutex); } static inline void ebt_free_table_info(struct ebt_table_info *info) @@ -1258,6 +1291,54 @@ out: return ret; } +int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct net *net)) +{ + struct ebt_template *tmpl; + + mutex_lock(&ebt_mutex); + list_for_each_entry(tmpl, &template_tables, list) { + if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) { + mutex_unlock(&ebt_mutex); + return -EEXIST; + } + } + + tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL); + if (!tmpl) { + mutex_unlock(&ebt_mutex); + return -ENOMEM; + } + + tmpl->table_init = table_init; + strscpy(tmpl->name, t->name, sizeof(tmpl->name)); + tmpl->owner = t->me; + list_add(&tmpl->list, &template_tables); + + mutex_unlock(&ebt_mutex); + return 0; +} +EXPORT_SYMBOL(ebt_register_template); + +void ebt_unregister_template(const struct ebt_table *t) +{ + struct ebt_template *tmpl; + + mutex_lock(&ebt_mutex); + list_for_each_entry(tmpl, &template_tables, list) { + if (strcmp(t->name, tmpl->name)) + continue; + + list_del(&tmpl->list); + mutex_unlock(&ebt_mutex); + kfree(tmpl); + return; + } + + mutex_unlock(&ebt_mutex); + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL(ebt_unregister_template); + static struct ebt_table *__ebt_find_table(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 8d033a75a766..fdbed3158555 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -88,6 +88,12 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, skb = ip_fraglist_next(&iter); } + + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; } slow_path: diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 3ad0a1df6712..e12fd3cad619 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -243,7 +243,7 @@ static void caif_ctrl_cb(struct cflayer *layr, cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; cf_sk->sk.sk_err = ECONNRESET; set_rx_flow_on(cf_sk); - cf_sk->sk.sk_error_report(&cf_sk->sk); + sk_error_report(&cf_sk->sk); break; default: @@ -539,7 +539,8 @@ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, goto err; ret = -EINVAL; - if (unlikely(msg->msg_iter.iov->iov_base == NULL)) + if (unlikely(msg->msg_iter.nr_segs == 0) || + unlikely(msg->msg_iter.iov->iov_base == NULL)) goto err; noblock = msg->msg_flags & MSG_DONTWAIT; diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index cac30e676ac9..23267c8db7c4 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -480,7 +480,7 @@ got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; - goto out_err; + goto out; } phy_layer->id = phyid; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index fadc7c8a3107..37b67194c0df 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -76,8 +76,6 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) u8 buf; priv = container_of(layr, struct chnl_net, chnl); - if (!priv) - return -EINVAL; skb = (struct sk_buff *) cfpkt_tonative(pkt); diff --git a/net/can/bcm.c b/net/can/bcm.c index f3e4d9528fa3..508f67de0b80 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -785,6 +785,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, bcm_rx_handler, op); list_del(&op->list); + synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } @@ -1417,7 +1418,7 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } break; @@ -1425,7 +1426,7 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } } } @@ -1533,9 +1534,13 @@ static int bcm_release(struct socket *sock) REGMASK(op->can_id), bcm_rx_handler, op); - bcm_remove_op(op); } + synchronize_rcu(); + + list_for_each_entry_safe(op, next, &bo->rx_ops, list) + bcm_remove_op(op); + #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) diff --git a/net/can/gw.c b/net/can/gw.c index ba4124805602..d8861e862f15 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -596,6 +596,7 @@ static int cgw_notifier(struct notifier_block *nb, if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1154,6 +1155,7 @@ static void cgw_remove_all_jobs(struct net *net) hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1222,6 +1224,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); err = 0; break; diff --git a/net/can/isotp.c b/net/can/isotp.c index be6183f8ca11..caaa532ece94 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -168,7 +168,7 @@ static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; @@ -225,8 +225,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) can_send_ret = can_send(nskb, 1); if (can_send_ret) - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); dev_put(dev); @@ -339,7 +339,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); @@ -392,7 +392,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); fallthrough; default: @@ -420,7 +420,7 @@ static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); return 1; } @@ -535,7 +535,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; @@ -559,7 +559,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); return 1; } @@ -758,7 +758,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; @@ -801,10 +801,12 @@ isotp_tx_burst: can_skb_set_owner(skb, sk); can_send_ret = can_send(skb, 1); - if (can_send_ret) - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); - + if (can_send_ret) { + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(can_send_ret)); + if (can_send_ret == -ENOBUFS) + pr_notice_once("can-isotp: tx queue is full, increasing txqueuelen may prevent this error\n"); + } if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; @@ -950,8 +952,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = can_send(skb, 1); dev_put(dev); if (err) { - pr_notice_once("can-isotp: %s: can_send_ret %d\n", - __func__, err); + pr_notice_once("can-isotp: %s: can_send_ret %pe\n", + __func__, ERR_PTR(err)); return err; } @@ -1028,9 +1030,6 @@ static int isotp_release(struct socket *sock) lock_sock(sk); - hrtimer_cancel(&so->txtimer); - hrtimer_cancel(&so->rxtimer); - /* remove current filters & unregister */ if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { if (so->ifindex) { @@ -1042,10 +1041,14 @@ static int isotp_release(struct socket *sock) SINGLE_MASK(so->rxid), isotp_rcv, sk); dev_put(dev); + synchronize_rcu(); } } } + hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->rxtimer); + so->ifindex = 0; so->bound = 0; @@ -1155,7 +1158,7 @@ out: if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } return err; @@ -1354,13 +1357,13 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; } } @@ -1482,7 +1485,7 @@ static __init int isotp_module_init(void) err = can_proto_register(&isotp_can_proto); if (err < 0) - pr_err("can: registration of isotp protocol failed\n"); + pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 12369b604ce9..f6df20808f5e 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -20,9 +20,12 @@ struct j1939_session; enum j1939_sk_errqueue_type { - J1939_ERRQUEUE_ACK, - J1939_ERRQUEUE_SCHED, - J1939_ERRQUEUE_ABORT, + J1939_ERRQUEUE_TX_ACK, + J1939_ERRQUEUE_TX_SCHED, + J1939_ERRQUEUE_TX_ABORT, + J1939_ERRQUEUE_RX_RTS, + J1939_ERRQUEUE_RX_DPO, + J1939_ERRQUEUE_RX_ABORT, }; /* j1939 devices */ @@ -87,6 +90,7 @@ struct j1939_priv { struct list_head j1939_socks; struct kref rx_kref; + u32 rx_tskey; }; void j1939_ecu_put(struct j1939_ecu *ecu); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index da3a7a7bcff2..08c8606cfd9c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -193,6 +193,10 @@ static void j1939_can_rx_unregister(struct j1939_priv *priv) can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); + /* The last reference of priv is dropped by the RCU deferred + * j1939_sk_sock_destruct() of the last socket, so we can + * safely drop this reference here. + */ j1939_priv_put(priv); } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 56aa66147d5a..6dff4510687a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -352,7 +352,7 @@ static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); - /* This function will be call by the generic networking code, when then + /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between @@ -398,6 +398,9 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + + /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ + sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; @@ -673,7 +676,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case SO_J1939_FILTER: - if (!sockptr_is_null(optval)) { + if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; @@ -902,20 +905,33 @@ failure: return NULL; } -static size_t j1939_sk_opt_stats_get_size(void) +static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { - return - nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ - 0; + switch (type) { + case J1939_ERRQUEUE_RX_RTS: + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ + nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ + nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ + nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ + nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ + nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ + 0; + default: + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ + 0; + } } static struct sk_buff * -j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) +j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, + enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; - stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC); + stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; @@ -925,32 +941,67 @@ j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) size = min(session->pkt.tx_acked * 7, session->total_message_size); - nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + switch (type) { + case J1939_ERRQUEUE_RX_RTS: + nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, + session->total_message_size); + nla_put_u32(stats, J1939_NLA_PGN, + session->skcb.addr.pgn); + nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, + session->skcb.addr.src_name, J1939_NLA_PAD); + nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, + session->skcb.addr.dst_name, J1939_NLA_PAD); + nla_put_u8(stats, J1939_NLA_SRC_ADDR, + session->skcb.addr.sa); + nla_put_u8(stats, J1939_NLA_DEST_ADDR, + session->skcb.addr.da); + break; + default: + nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + } return stats; } -void j1939_sk_errqueue(struct j1939_session *session, - enum j1939_sk_errqueue_type type) +static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; - struct sock *sk = session->sk; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; int err; - /* currently we have no sk for the RX session */ - if (!sk) - return; - jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; - skb = j1939_sk_get_timestamping_opt_stats(session); + switch (type) { + case J1939_ERRQUEUE_TX_ACK: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + return; + break; + case J1939_ERRQUEUE_TX_SCHED: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + return; + break; + case J1939_ERRQUEUE_TX_ABORT: + break; + case J1939_ERRQUEUE_RX_RTS: + fallthrough; + case J1939_ERRQUEUE_RX_DPO: + fallthrough; + case J1939_ERRQUEUE_RX_ABORT: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) + return; + break; + default: + netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); + } + + skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; @@ -961,36 +1012,42 @@ void j1939_sk_errqueue(struct j1939_session *session, serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { - case J1939_ERRQUEUE_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) { - kfree_skb(skb); - return; - } - + case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; - state = "ACK"; + state = "TX ACK"; break; - case J1939_ERRQUEUE_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) { - kfree_skb(skb); - return; - } - + case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; - state = "SCH"; + state = "TX SCH"; break; - case J1939_ERRQUEUE_ABORT: + case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; - state = "ABT"; + state = "TX ABT"; + break; + case J1939_ERRQUEUE_RX_RTS: + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_RTS; + state = "RX RTS"; + break; + case J1939_ERRQUEUE_RX_DPO: + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_DPO; + state = "RX DPO"; + break; + case J1939_ERRQUEUE_RX_ABORT: + serr->ee.ee_errno = session->err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; + state = "RX ABT"; break; - default: - netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } serr->opt_stats = true; @@ -1005,11 +1062,32 @@ void j1939_sk_errqueue(struct j1939_session *session, kfree_skb(skb); }; +void j1939_sk_errqueue(struct j1939_session *session, + enum j1939_sk_errqueue_type type) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sock *jsk; + + if (session->sk) { + /* send TX notifications to the socket of origin */ + __j1939_sk_errqueue(session, session->sk, type); + return; + } + + /* spread RX notifications to all sockets subscribed to this session */ + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + if (j1939_sk_recv_match_one(jsk, &session->skcb)) + __j1939_sk_errqueue(session, &jsk->sk, type); + } + spin_unlock_bh(&priv->j1939_socks_lock); +}; + void j1939_sk_send_loop_abort(struct sock *sk, int err) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, @@ -1189,7 +1267,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) - jsk->sk.sk_error_report(&jsk->sk); + sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index c3946c355882..bb5c4b8979be 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,10 +260,14 @@ static void __j1939_session_drop(struct j1939_session *session) static void j1939_session_destroy(struct j1939_session *session) { - if (session->err) - j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); - else - j1939_sk_errqueue(session, J1939_ERRQUEUE_ACK); + if (session->transmission) { + if (session->err) + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK); + } else if (session->err) { + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); + } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); @@ -776,7 +780,7 @@ static int j1939_session_tx_dpo(struct j1939_session *session) static int j1939_session_tx_dat(struct j1939_session *session) { struct j1939_priv *priv = session->priv; - struct j1939_sk_buff_cb *skcb; + struct j1939_sk_buff_cb *se_skcb; int offset, pkt_done, pkt_end; unsigned int len, pdelay; struct sk_buff *se_skb; @@ -788,7 +792,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (!se_skb) return -ENOBUFS; - skcb = j1939_skb_to_cb(se_skb); + se_skcb = j1939_skb_to_cb(se_skb); tpdat = se_skb->data; ret = 0; pkt_done = 0; @@ -800,7 +804,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) while (session->pkt.tx < pkt_end) { dat[0] = session->pkt.tx - session->pkt.dpo + 1; - offset = (session->pkt.tx * 7) - skcb->offset; + offset = (session->pkt.tx * 7) - se_skcb->offset; len = se_skb->len - offset; if (len > 7) len = 7; @@ -808,7 +812,8 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (offset + len > se_skb->len) { netdev_err_once(priv->ndev, "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n", - __func__, session, skcb->offset, se_skb->len , session->pkt.tx); + __func__, session, se_skcb->offset, + se_skb->len , session->pkt.tx); ret = -EOVERFLOW; goto out_free; } @@ -821,7 +826,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) memcpy(&dat[1], &tpdat[offset], len); ret = j1939_tp_tx_dat(session, dat, len + 1); if (ret < 0) { - /* ENOBUS == CAN interface TX queue is full */ + /* ENOBUFS == CAN interface TX queue is full */ if (ret != -ENOBUFS) netdev_alert(priv->ndev, "%s: 0x%p: queue data error: %i\n", @@ -1043,7 +1048,7 @@ static int j1939_simple_txnext(struct j1939_session *session) if (ret) goto out_free; - j1939_sk_errqueue(session, J1939_ERRQUEUE_SCHED); + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_SCHED); j1939_sk_queue_activate_next(session); out_free: @@ -1075,11 +1080,16 @@ static bool j1939_session_deactivate_locked(struct j1939_session *session) static bool j1939_session_deactivate(struct j1939_session *session) { + struct j1939_priv *priv = session->priv; bool active; - j1939_session_list_lock(session->priv); + j1939_session_list_lock(priv); + /* This function should be called with a session ref-count of at + * least 2. + */ + WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); - j1939_session_list_unlock(session->priv); + j1939_session_list_unlock(priv); return active; } @@ -1092,7 +1102,7 @@ j1939_session_deactivate_activate_next(struct j1939_session *session) } static void __j1939_session_cancel(struct j1939_session *session, - enum j1939_xtp_abort err) + enum j1939_xtp_abort err) { struct j1939_priv *priv = session->priv; @@ -1110,6 +1120,8 @@ static void __j1939_session_cancel(struct j1939_session *session, if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } static void j1939_session_cancel(struct j1939_session *session, @@ -1190,13 +1202,13 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) static void j1939_session_completed(struct j1939_session *session) { - struct sk_buff *skb; + struct sk_buff *se_skb; if (!session->transmission) { - skb = j1939_session_skb_get(session); + se_skb = j1939_session_skb_get(session); /* distribute among j1939 receivers */ - j1939_sk_recv(session->priv, skb); - consume_skb(skb); + j1939_sk_recv(session->priv, se_skb); + consume_skb(se_skb); } j1939_session_deactivate_activate_next(session); @@ -1263,12 +1275,14 @@ static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, break; case J1939_ETP_CMD_RTS: - case J1939_TP_CMD_RTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_RTS: abort = J1939_XTP_ABORT_BUSY; break; case J1939_ETP_CMD_CTS: - case J1939_TP_CMD_CTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_CTS: abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN; break; @@ -1277,7 +1291,8 @@ static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, break; case J1939_ETP_CMD_EOMA: - case J1939_TP_CMD_EOMA: /* fall through */ + fallthrough; + case J1939_TP_CMD_EOMA: abort = J1939_XTP_ABORT_OTHER; break; @@ -1321,6 +1336,8 @@ static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb, session->err = j1939_xtp_abort_to_errno(priv, abort); if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); j1939_session_deactivate_activate_next(session); abort_put: @@ -1429,7 +1446,7 @@ j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) if (session->transmission) { if (session->pkt.tx_acked) j1939_sk_errqueue(session, - J1939_ERRQUEUE_SCHED); + J1939_ERRQUEUE_TX_SCHED); j1939_session_txtimer_cancel(session); j1939_tp_schedule_txtimer(session, 0); } @@ -1621,6 +1638,9 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, session->pkt.rx = 0; session->pkt.tx = 0; + session->tskey = priv->rx_tskey++; + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS); + WARN_ON_ONCE(j1939_session_activate(session)); return session; @@ -1743,6 +1763,9 @@ static void j1939_xtp_rx_dpo_one(struct j1939_session *session, session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data); session->last_cmd = dat[0]; j1939_tp_set_rxtimeout(session, 750); + + if (!session->transmission) + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO); } static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, @@ -1767,7 +1790,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { struct j1939_priv *priv = session->priv; - struct j1939_sk_buff_cb *skcb; + struct j1939_sk_buff_cb *skcb, *se_skcb; struct sk_buff *se_skb = NULL; const u8 *dat; u8 *tpdat; @@ -1792,7 +1815,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, break; fallthrough; case J1939_TP_CMD_BAM: - case J1939_TP_CMD_CTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_CTS: if (skcb->addr.type != J1939_ETP) break; fallthrough; @@ -1817,8 +1841,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, goto out_session_cancel; } - skcb = j1939_skb_to_cb(se_skb); - offset = packet * 7 - skcb->offset; + se_skcb = j1939_skb_to_cb(se_skb); + offset = packet * 7 - se_skcb->offset; nbytes = se_skb->len - offset; if (nbytes > 7) nbytes = 7; @@ -1846,7 +1870,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, if (packet == session->pkt.rx) session->pkt.rx++; - if (skcb->addr.type != J1939_ETP && + if (se_skcb->addr.type != J1939_ETP && j1939_cb_is_broadcast(&session->skcb)) { if (session->pkt.rx >= session->pkt.total) final = true; @@ -1869,7 +1893,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, if (!session->transmission) j1939_tp_schedule_txtimer(session, 0); } else { - j1939_tp_set_rxtimeout(session, 250); + j1939_tp_set_rxtimeout(session, 750); } session->last_cmd = 0xff; consume_skb(se_skb); @@ -1995,7 +2019,8 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: - case J1939_TP_CMD_RTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) return; diff --git a/net/can/proc.c b/net/can/proc.c index d1fe49e6f16d..b3099f0a3cb8 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -99,8 +99,6 @@ static void can_init_stats(struct net *net) static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, unsigned long count) { - unsigned long rate; - if (oldjif == newjif) return 0; @@ -111,9 +109,7 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, return 99999999; } - rate = (count * HZ) / (newjif - oldjif); - - return rate; + return (count * HZ) / (newjif - oldjif); } void can_stat_update(struct timer_list *t) diff --git a/net/can/raw.c b/net/can/raw.c index ac96fc210025..7105fa4824e4 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -295,13 +295,13 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg, sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); break; } } @@ -488,7 +488,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } return err; @@ -546,10 +546,18 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; } + rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) + if (ro->bound && ro->ifindex) { dev = dev_get_by_index(sock_net(sk), ro->ifindex); + if (!dev) { + if (count > 1) + kfree(filter); + err = -ENODEV; + goto out_fil; + } + } if (ro->bound) { /* (try to) register the new filters */ @@ -584,10 +592,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->count = count; out_fil: - if (dev) - dev_put(dev); - + dev_put(dev); release_sock(sk); + rtnl_unlock(); break; @@ -600,10 +607,16 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, err_mask &= CAN_ERR_MASK; + rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) + if (ro->bound && ro->ifindex) { dev = dev_get_by_index(sock_net(sk), ro->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + } /* remove current error mask */ if (ro->bound) { @@ -623,10 +636,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->err_mask = err_mask; out_err: - if (dev) - dev_put(dev); - + dev_put(dev); release_sock(sk); + rtnl_unlock(); break; diff --git a/net/ceph/auth.c b/net/ceph/auth.c index d2b268a1838e..d38c9eadbe2f 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -58,12 +58,10 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const int *con_modes) { struct ceph_auth_client *ac; - int ret; - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) - goto out; + return ERR_PTR(-ENOMEM); mutex_init(&ac->mutex); ac->negotiating = true; @@ -78,9 +76,6 @@ struct ceph_auth_client *ceph_auth_init(const char *name, dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, ac->name, ac->preferred_mode, ac->fallback_mode); return ac; - -out: - return ERR_PTR(ret); } void ceph_auth_destroy(struct ceph_auth_client *ac) diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 097e9f8d87a7..77b5519bc45f 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -112,8 +112,8 @@ static int ceph_auth_none_create_authorizer( auth->authorizer = (struct ceph_authorizer *) au; auth->authorizer_buf = au->buf; auth->authorizer_buf_len = au->buf_len; - auth->authorizer_reply_buf = au->reply_buf; - auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + auth->authorizer_reply_buf = NULL; + auth->authorizer_reply_buf_len = 0; return 0; } diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h index 4158f064302e..bb121539e796 100644 --- a/net/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -16,7 +16,6 @@ struct ceph_none_authorizer { struct ceph_authorizer base; char buf[128]; int buf_len; - char reply_buf[0]; }; struct ceph_auth_none_info { diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 792fcb974dc3..9c60feeb1bcb 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -87,7 +87,7 @@ struct ceph_x_authorize_reply { /* - * encyption bundle + * encryption bundle */ #define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 17447c19d937..66136a4c1ce7 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -10,7 +10,9 @@ /** * ceph_cls_lock - grab rados lock for object - * @oid, @oloc: object to lock + * @osdc: OSD client instance + * @oid: object to lock + * @oloc: object to lock * @lock_name: the name of the lock * @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED) * @cookie: user-defined identifier for this instance of the lock @@ -82,7 +84,9 @@ EXPORT_SYMBOL(ceph_cls_lock); /** * ceph_cls_unlock - release rados lock for object - * @oid, @oloc: object to lock + * @osdc: OSD client instance + * @oid: object to lock + * @oloc: object to lock * @lock_name: the name of the lock * @cookie: user-defined identifier for this instance of the lock */ @@ -130,7 +134,9 @@ EXPORT_SYMBOL(ceph_cls_unlock); /** * ceph_cls_break_lock - release rados lock for object for specified client - * @oid, @oloc: object to lock + * @osdc: OSD client instance + * @oid: object to lock + * @oloc: object to lock * @lock_name: the name of the lock * @cookie: user-defined identifier for this instance of the lock * @locker: current lock owner diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 195ceb8afb06..013cbdb6cfe2 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1508,7 +1508,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, return get_generic_reply(con, hdr, skip); /* - * Older OSDs don't set reply tid even if the orignal + * Older OSDs don't set reply tid even if the original * request had a non-zero tid. Work around this weirdness * by allocating a new message. */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c959320c4775..75b738083523 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1309,7 +1309,7 @@ static int get_osdmap_client_data_v(void **p, void *end, return -EINVAL; } - /* old osdmap enconding */ + /* old osdmap encoding */ struct_v = 0; } @@ -3010,7 +3010,7 @@ static bool is_valid_crush_name(const char *name) * parent, returns 0. * * Does a linear search, as there are no parent pointers of any - * kind. Note that the result is ambigous for items that occur + * kind. Note that the result is ambiguous for items that occur * multiple times in the map. */ static int get_immediate_parent(struct crush_map *c, int id, diff --git a/net/core/Makefile b/net/core/Makefile index f7f16650fe9e..35ced6201814 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o -ifeq ($(CONFIG_INET),y) obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o -endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cc3712ad8716..68d2cbf8331a 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -416,7 +416,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { - if (in_irq() || in_nmi()) + if (in_hardirq() || in_nmi()) return (unsigned long)NULL; return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); @@ -425,7 +425,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { - if (in_irq() || in_nmi()) + if (in_hardirq() || in_nmi()) return -EPERM; return ____bpf_sk_storage_delete(map, sk); @@ -524,8 +524,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) nr_maps++; } - diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, - GFP_KERNEL); + diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); diff --git a/net/core/dev.c b/net/core/dev.c index ef8cf7619baf..74fd402d26dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -131,6 +131,7 @@ #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> +#include <trace/events/qdisc.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -148,6 +149,7 @@ #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> +#include <linux/once_lite.h> #include "net-sysfs.h" @@ -674,131 +676,6 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); -/****************************************************************************** - * - * Device Boot-time Settings Routines - * - ******************************************************************************/ - -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; - -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. - */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; - - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strlcpy(s[i].name, name, IFNAMSIZ); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } - - return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; -} - -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) -{ - struct netdev_boot_setup *s = dev_boot_setup; - int i; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} -EXPORT_SYMBOL(netdev_boot_setup_check); - - -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. - */ -unsigned long netdev_boot_base(const char *prefix, int unit) -{ - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; - - sprintf(name, "%s%d", prefix, unit); - - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(&init_net, name)) - return 1; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; -} - -/* - * Saves at boot time configured settings for any netdevice. - */ -int __init netdev_boot_setup(char *str) -{ - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; - - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} - -__setup("netdev=", netdev_boot_setup); - /******************************************************************************* * * Device Interface Subroutines @@ -954,8 +831,7 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -1028,8 +904,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -3097,6 +2972,50 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** + * netif_set_real_num_queues - set actual number of RX and TX queues used + * @dev: Network device + * @txq: Actual number of TX queues + * @rxq: Actual number of RX queues + * + * Set the real number of both TX and RX queues. + * Does nothing if the number of queues is already correct. + */ +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq) +{ + unsigned int old_rxq = dev->real_num_rx_queues; + int err; + + if (txq < 1 || txq > dev->num_tx_queues || + rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + /* Start from increases, so the error path only does decreases - + * decreases can't fail. + */ + if (rxq > dev->real_num_rx_queues) { + err = netif_set_real_num_rx_queues(dev, rxq); + if (err) + return err; + } + if (txq > dev->real_num_tx_queues) { + err = netif_set_real_num_tx_queues(dev, txq); + if (err) + goto undo_rx; + } + if (rxq < dev->real_num_rx_queues) + WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); + if (txq < dev->real_num_tx_queues) + WARN_ON(netif_set_real_num_tx_queues(dev, txq)); + + return 0; +undo_rx: + WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); + return err; +} +EXPORT_SYMBOL(netif_set_real_num_queues); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * * This routine should set an upper limit on the number of RSS queues @@ -3188,7 +3107,7 @@ EXPORT_SYMBOL(__dev_kfree_skb_irq); void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { - if (in_irq() || irqs_disabled()) + if (in_hardirq() || irqs_disabled()) __dev_kfree_skb_irq(skb, reason); else dev_kfree_skb(skb); @@ -3487,13 +3406,16 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG +static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) +{ + pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + skb_dump(KERN_ERR, skb, true); + dump_stack(); +} + void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - if (net_ratelimit()) { - pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); - skb_dump(KERN_ERR, skb, true); - dump_stack(); - } + DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif @@ -3840,6 +3762,18 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) } } +static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, + struct sk_buff **to_free, + struct netdev_queue *txq) +{ + int rc; + + rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; + if (rc == NET_XMIT_SUCCESS) + trace_qdisc_enqueue(q, txq, skb); + return rc; +} + static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) @@ -3852,10 +3786,32 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - if (likely(!netif_xmit_frozen_or_stopped(txq))) - qdisc_run(q); + if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && + qdisc_run_begin(q)) { + /* Retest nolock_qdisc_is_empty() within the protection + * of q->seqlock to protect from racing with requeuing. + */ + if (unlikely(!nolock_qdisc_is_empty(q))) { + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + __qdisc_run(q); + qdisc_run_end(q); + + goto no_lock_out; + } + + qdisc_bstats_cpu_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && + !nolock_qdisc_is_empty(q)) + __qdisc_run(q); + + qdisc_run_end(q); + return NET_XMIT_SUCCESS; + } + + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + qdisc_run(q); +no_lock_out: if (unlikely(to_free)) kfree_skb_list(to_free); return rc; @@ -3896,7 +3852,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3973,7 +3929,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -4363,7 +4319,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, * makes sure to proceed with napi polling * if the thread is explicitly woken from here. */ - if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE) + if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; @@ -4717,45 +4673,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) return rxqueue; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - u32 metalen, act = XDP_DROP; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; + u32 metalen, act; int off; - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_is_redirected(skb)) - return XDP_PASS; - - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. - */ - if (skb_cloned(skb) || skb_is_nonlinear(skb) || - skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) - goto do_drop; - } - /* The XDP program wants to see the packet starting at the MAC * header. */ @@ -4810,6 +4739,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb->protocol = eth_type_trans(skb, skb->dev); } + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ switch (act) { case XDP_REDIRECT: case XDP_TX: @@ -4820,6 +4756,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (metalen) skb_metadata_set(skb, metalen); break; + } + + return act; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (skb_linearize(skb)) + goto do_drop; + } + + act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; default: bpf_warn_invalid_xdp_action(act); fallthrough; @@ -5102,8 +5081,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, - &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -5277,15 +5255,14 @@ another_round: if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; - preempt_disable(); + migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - preempt_enable(); + migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } - skb_reset_mac_len(skb); } if (eth_type_vlan(skb->protocol)) { @@ -5611,25 +5588,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; - if (new) { - u32 i; - - mutex_lock(&new->aux->used_maps_mutex); - - /* generic XDP does not work with DEVMAPs that can - * have a bpf_prog installed on an entry - */ - for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i]) || - cpu_map_prog_allowed(new->aux->used_maps[i])) { - mutex_unlock(&new->aux->used_maps_mutex); - return -EINVAL; - } - } - - mutex_unlock(&new->aux->used_maps_mutex); - } - switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -5837,7 +5795,7 @@ static void flush_all_backlogs(void) */ ASSERT_RTNL(); - get_online_cpus(); + cpus_read_lock(); cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { @@ -5855,7 +5813,7 @@ static void flush_all_backlogs(void) for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); - put_online_cpus(); + cpus_read_unlock(); } /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ @@ -5972,7 +5930,6 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), @@ -5981,6 +5938,32 @@ static void gro_list_prepare(const struct list_head *head, diffs = memcmp(skb_mac_header(p), skb_mac_header(skb), maclen); + + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + } + NAPI_GRO_CB(p)->same_flow = !diffs; } } @@ -6194,6 +6177,8 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); + else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); else __kfree_skb_defer(skb); break; @@ -6242,7 +6227,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - skb_ext_reset(skb); + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } napi->skb = skb; } @@ -6520,11 +6510,18 @@ EXPORT_SYMBOL(napi_schedule_prep); * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * - * Variant of __napi_schedule() assuming hard irqs are masked + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { - ____napi_schedule(this_cpu_ptr(&softnet_data), n); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); @@ -7535,7 +7532,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, { struct netdev_adjacent *lower; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); @@ -9300,7 +9297,7 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } -static u8 dev_xdp_prog_count(struct net_device *dev) +u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; @@ -9310,6 +9307,7 @@ static u8 dev_xdp_prog_count(struct net_device *dev) count++; return count; } +EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { @@ -9403,6 +9401,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; + struct net_device *upper; + struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; @@ -9441,6 +9441,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* don't allow if an upper device already has a program */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (dev_xdp_prog_count(upper) > 0) { + NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); + return -EEXIST; + } + } + cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { @@ -9650,14 +9658,17 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct net_device *dev; int err, fd; + rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); - if (!dev) + if (!dev) { + rtnl_unlock(); return -EINVAL; + } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; - goto out_put_dev; + goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); @@ -9667,14 +9678,14 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); - goto out_put_dev; + goto unlock; } - rtnl_lock(); err = dev_xdp_attach_link(dev, NULL, link); rtnl_unlock(); if (err) { + link->dev = NULL; bpf_link_cleanup(&link_primer); goto out_put_dev; } @@ -9684,6 +9695,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) dev_put(dev); return fd; +unlock: + rtnl_unlock(); + out_put_dev: dev_put(dev); return err; @@ -10066,7 +10080,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -10133,7 +10147,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -10773,7 +10787,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!p) return NULL; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 45ae6eeb2964..8c39283c26ae 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -16,10 +16,9 @@ * General list handling functions */ -static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, - const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global, - bool sync) +static struct netdev_hw_addr* +__hw_addr_create(const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; @@ -29,32 +28,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) - return -ENOMEM; + return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; - list_add_tail_rcu(&ha->list, &list->list); - list->count++; - return 0; + return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, - int sync_count) + int sync_count, bool exclusive) { + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; - list_for_each_entry(ha, &list->list, list) { - if (ha->type == addr_type && - !memcmp(ha->addr, addr, addr_len)) { + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + parent = *ins_point; + if (diff < 0) { + ins_point = &parent->rb_left; + } else if (diff > 0) { + ins_point = &parent->rb_right; + } else { + if (exclusive) + return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) @@ -73,8 +84,25 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } } - return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, - sync); + ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); + if (!ha) + return -ENOMEM; + + /* The first address in dev->dev_addrs is pointed to by dev->dev_addr + * and mutated freely by device drivers and netdev ops, so if we insert + * it into the tree we'll end up with an invalid rbtree. + */ + if (list->count > 0) { + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); + } else { + RB_CLEAR_NODE(&ha->node); + } + + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + + return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, @@ -82,7 +110,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, - 0); + 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, @@ -103,24 +131,61 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; + + if (!RB_EMPTY_NODE(&ha->node)) + rb_erase(&ha->node, &list->tree); + list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } +static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + struct rb_node *node; + + /* The first address isn't inserted into the tree because in the dev->dev_addrs + * list it's the address pointed to by dev->dev_addr which is freely mutated + * in place, so we need to check it separately. + */ + ha = list_first_entry(&list->list, struct netdev_hw_addr, list); + if (ha && !memcmp(addr, ha->addr, addr_len) && + (!addr_type || addr_type == ha->type)) + return ha; + + node = list->tree.rb_node; + + while (node) { + struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); + int diff = memcmp(addr, ha->addr, addr_len); + + if (diff == 0 && addr_type) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + if (diff < 0) + node = node->rb_left; + else if (diff > 0) + node = node->rb_right; + else + return ha; + } + + return NULL; +} + static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { - struct netdev_hw_addr *ha; + struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) - return __hw_addr_del_entry(list, ha, global, sync); - } - return -ENOENT; + if (!ha) + return -ENOENT; + return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, @@ -137,7 +202,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, - false, true, ha->sync_cnt); + false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; @@ -407,6 +472,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; + list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); @@ -418,6 +484,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; + list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); @@ -552,22 +619,14 @@ EXPORT_SYMBOL(dev_addr_del); */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->uc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_UNICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST, true, false); + err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -745,22 +804,14 @@ EXPORT_SYMBOL(dev_uc_init); */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->mc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_MULTICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, true, false); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -773,7 +824,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global, false, 0); + NETDEV_HW_ADDR_T_MULTICAST, global, false, + 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 478d032f34ac..0e87237fd871 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/wireless.h> +#include <linux/if_bridge.h> #include <net/dsa.h> #include <net/wext.h> @@ -25,79 +27,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr) return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } -static gifconf_func_t *gifconf_list[NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); - /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ - -int dev_ifconf(struct net *net, struct ifconf *ifc, int size) +int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + void __user *pos; + size_t size; + int len, total = 0, done; - /* - * Fetch the caller's info block. - */ + /* both the ifconf and the ifreq structures are slightly different */ + if (in_compat_syscall()) { + struct compat_ifconf ifc32; - pos = ifc->ifc_buf; - len = ifc->ifc_len; + if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) + return -EFAULT; - /* - * Loop over the interfaces, and write an info block for each. - */ + pos = compat_ptr(ifc32.ifcbuf); + len = ifc32.ifc_len; + size = sizeof(struct compat_ifreq); + } else { + struct ifconf ifc; + + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; - total = 0; + pos = ifc.ifc_buf; + len = ifc.ifc_len; + size = sizeof(struct ifreq); + } + + /* Loop over the interfaces, and write an info block for each. */ + rtnl_lock(); for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0, size); - else - done = gifconf_list[i](dev, pos + total, - len - total, size); - if (done < 0) - return -EFAULT; - total += done; - } + if (!pos) + done = inet_gifconf(dev, NULL, 0, size); + else + done = inet_gifconf(dev, pos + total, + len - total, size); + if (done < 0) { + rtnl_unlock(); + return -EFAULT; } + total += done; } + rtnl_unlock(); - /* - * All done. Write the updated control block back to the caller. - */ - ifc->ifc_len = total; + return put_user(total, &uifc->ifc_len); +} + +static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct ifmap *ifmap = &ifr->ifr_map; + + if (in_compat_syscall()) { + struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; + + cifmap->mem_start = dev->mem_start; + cifmap->mem_end = dev->mem_end; + cifmap->base_addr = dev->base_addr; + cifmap->irq = dev->irq; + cifmap->dma = dev->dma; + cifmap->port = dev->if_port; + + return 0; + } + + ifmap->mem_start = dev->mem_start; + ifmap->mem_end = dev->mem_end; + ifmap->base_addr = dev->base_addr; + ifmap->irq = dev->irq; + ifmap->dma = dev->dma; + ifmap->port = dev->if_port; - /* - * Both BSD and Solaris return 0 here, so we do too. - */ return 0; } +static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; + + if (!dev->netdev_ops->ndo_set_config) + return -EOPNOTSUPP; + + if (in_compat_syscall()) { + struct ifmap ifmap = { + .mem_start = cifmap->mem_start, + .mem_end = cifmap->mem_end, + .base_addr = cifmap->base_addr, + .irq = cifmap->irq, + .dma = cifmap->dma, + .port = cifmap->port, + }; + + return dev->netdev_ops->ndo_set_config(dev, &ifmap); + } + + return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); +} + /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ @@ -128,13 +159,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm break; case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; @@ -215,19 +240,19 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } -static int dev_do_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) +static int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; int err; - err = dsa_ndo_do_ioctl(dev, ifr, cmd); + err = dsa_ndo_eth_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) return err; - if (ops->ndo_do_ioctl) { + if (ops->ndo_eth_ioctl) { if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); + err = ops->ndo_eth_ioctl(dev, ifr, cmd); else err = -ENODEV; } @@ -235,10 +260,55 @@ static int dev_do_ioctl(struct net_device *dev, return err; } +static int dev_siocbond(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocbond) { + if (netif_device_present(dev)) + return ops->ndo_siocbond(dev, ifr, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocdevprivate) { + if (netif_device_present(dev)) + return ops->ndo_siocdevprivate(dev, ifr, data, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocwandev) { + if (netif_device_present(dev)) + return ops->ndo_siocwandev(dev, ifs); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, + unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); @@ -275,12 +345,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; + return dev_setifmap(dev, ifr); case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -307,6 +372,22 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCWANDEV: + return dev_siocwandev(dev, &ifr->ifr_settings); + + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!netif_device_present(dev)) + return -ENODEV; + if (!netif_is_bridge_master(dev)) + return -EOPNOTSUPP; + dev_hold(dev); + rtnl_unlock(); + err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); + dev_put(dev); + rtnl_lock(); + return err; + case SIOCSHWTSTAMP: err = net_hwtstamp_validate(ifr); if (err) @@ -317,23 +398,23 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) * Unknown or private ioctl */ default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) + return dev_siocdevprivate(dev, ifr, data, cmd); + + if (cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCSHWTSTAMP || + cmd == SIOCGHWTSTAMP) { + err = dev_eth_ioctl(dev, ifr, cmd); + } else if (cmd == SIOCBONDENSLAVE || cmd == SIOCBONDRELEASE || cmd == SIOCBONDSETHWADDR || cmd == SIOCBONDSLAVEINFOQUERY || cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCGHWTSTAMP || - cmd == SIOCWANDEV) { - err = dev_do_ioctl(dev, ifr, cmd); + cmd == SIOCBONDCHANGEACTIVE) { + err = dev_siocbond(dev, ifr, cmd); } else err = -EINVAL; @@ -386,7 +467,8 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. */ -int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, + void __user *data, bool *need_copyout) { int ret; char *colon; @@ -437,7 +519,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCETHTOOL: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ethtool(net, ifr); + ret = dev_ethtool(net, ifr, data); rtnl_unlock(); if (colon) *colon = ':'; @@ -456,7 +538,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (colon) *colon = ':'; @@ -502,7 +584,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (need_copyout) *need_copyout = false; @@ -527,7 +609,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); return ret; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 051432ea4f69..a856ae401ea5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -92,7 +92,8 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), }; -static LIST_HEAD(devlink_list); +static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); +#define DEVLINK_REGISTERED XA_MARK_1 /* devlink_mutex * @@ -108,23 +109,23 @@ struct net *devlink_net(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_net); -static void __devlink_net_set(struct devlink *devlink, struct net *net) +static void devlink_put(struct devlink *devlink) { - write_pnet(&devlink->_net, net); + if (refcount_dec_and_test(&devlink->refcount)) + complete(&devlink->comp); } -void devlink_net_set(struct devlink *devlink, struct net *net) +static bool __must_check devlink_try_get(struct devlink *devlink) { - if (WARN_ON(devlink->registered)) - return; - __devlink_net_set(devlink, net); + return refcount_inc_not_zero(&devlink->refcount); } -EXPORT_SYMBOL_GPL(devlink_net_set); static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { struct devlink *devlink; + unsigned long index; + bool found = false; char *busname; char *devname; @@ -136,19 +137,19 @@ static struct devlink *devlink_get_from_attrs(struct net *net, lockdep_assert_held(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (strcmp(devlink->dev->bus->name, busname) == 0 && strcmp(dev_name(devlink->dev), devname) == 0 && - net_eq(devlink_net(devlink), net)) - return devlink; + net_eq(devlink_net(devlink), net)) { + found = true; + break; + } } - return ERR_PTR(-ENODEV); -} + if (!found || !devlink_try_get(devlink)) + devlink = ERR_PTR(-ENODEV); -static struct devlink *devlink_get_from_info(struct genl_info *info) -{ - return devlink_get_from_attrs(genl_info_net(info), info->attrs); + return devlink; } static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, @@ -190,6 +191,80 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, return devlink_port_get_from_attrs(devlink, info->attrs); } +static inline bool +devlink_rate_is_leaf(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; +} + +static inline bool +devlink_rate_is_node(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; +} + +static struct devlink_rate * +devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct devlink_rate *devlink_rate; + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); + if (IS_ERR(devlink_port)) + return ERR_CAST(devlink_port); + devlink_rate = devlink_port->devlink_rate; + return devlink_rate ?: ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) +{ + static struct devlink_rate *devlink_rate; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate) && + !strcmp(node_name, devlink_rate->name)) + return devlink_rate; + } + return ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + const char *rate_node_name; + size_t len; + + if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return ERR_PTR(-EINVAL); + rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); + len = strlen(rate_node_name); + /* Name cannot be empty or decimal number */ + if (!len || strspn(rate_node_name, "0123456789") == len) + return ERR_PTR(-EINVAL); + + return devlink_rate_node_get_by_name(devlink, rate_node_name); +} + +static struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_rate_node_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct nlattr **attrs = info->attrs; + + if (attrs[DEVLINK_ATTR_PORT_INDEX]) + return devlink_rate_leaf_get_from_info(devlink, info); + else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return devlink_rate_node_get_from_info(devlink, info); + else + return ERR_PTR(-EINVAL); +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -408,12 +483,14 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) +#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) /* The per devlink instance lock is taken by default in the pre-doit * operation, yet several commands do not require this. The global * devlink lock is taken and protects from disruption by user-calls. */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(2) +#define DEVLINK_NL_FLAG_NO_LOCK BIT(4) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -423,7 +500,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, int err; mutex_lock(&devlink_mutex); - devlink = devlink_get_from_info(info); + devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); if (IS_ERR(devlink)) { mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); @@ -442,12 +519,31 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, devlink_port = devlink_port_get_from_info(devlink, info); if (!IS_ERR(devlink_port)) info->user_ptr[1] = devlink_port; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { + struct devlink_rate *devlink_rate; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) { + err = PTR_ERR(devlink_rate); + goto unlock; + } + info->user_ptr[1] = devlink_rate; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) { + err = PTR_ERR(rate_node); + goto unlock; + } + info->user_ptr[1] = rate_node; } return 0; unlock: if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return err; } @@ -460,6 +556,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, devlink = info->user_ptr[0]; if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); } @@ -723,10 +820,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } -static int -devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops, - struct devlink_port *port, struct sk_buff *msg, - struct netlink_ext_ack *extack, bool *msg_updated) +static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops, + struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; @@ -735,7 +833,8 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * if (!ops->port_function_hw_addr_get) return 0; - err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len, + extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -748,6 +847,55 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * return 0; } +static int devlink_nl_rate_fill(struct sk_buff *msg, + struct devlink_rate *devlink_rate, + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) +{ + struct devlink *devlink = devlink_rate->devlink; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) + goto nla_put_failure; + + if (devlink_rate_is_leaf(devlink_rate)) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + devlink_rate->devlink_port->index)) + goto nla_put_failure; + } else if (devlink_rate_is_node(devlink_rate)) { + if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, + devlink_rate->name)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, + devlink_rate->tx_share, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, + devlink_rate->tx_max, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (devlink_rate->parent) + if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, + devlink_rate->parent->name)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { @@ -762,12 +910,11 @@ devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } -static int -devlink_port_fn_state_fill(struct devlink *devlink, - const struct devlink_ops *ops, - struct devlink_port *port, struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) +static int devlink_port_fn_state_fill(const struct devlink_ops *ops, + struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; @@ -776,7 +923,7 @@ devlink_port_fn_state_fill(struct devlink *devlink, if (!ops->port_fn_state_get) return 0; - err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack); + err = ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -804,7 +951,6 @@ static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { - struct devlink *devlink = port->devlink; const struct devlink_ops *ops; struct nlattr *function_attr; bool msg_updated = false; @@ -814,13 +960,12 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por if (!function_attr) return -EMSGSIZE; - ops = devlink->ops; - err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg, - extack, &msg_updated); + ops = port->devlink->ops; + err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack, + &msg_updated); if (err) goto out; - err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack, - &msg_updated); + err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); @@ -829,12 +974,12 @@ out: return err; } -static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) { + struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -895,28 +1040,131 @@ nla_put_failure: static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { - struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; - if (!devlink_port->registered) + WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) return; - WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(devlink_port->devlink), msg, 0, + DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static void devlink_rate_notify(struct devlink_rate *devlink_rate, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0, - NULL); + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(devlink_rate->devlink), msg, 0, + DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_rate *devlink_rate; + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err = 0; + + mutex_lock(&devlink_mutex); + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; + + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, NULL); + if (err) { + mutex_unlock(&devlink->lock); + devlink_put(devlink); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); + } +out: + mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static bool +devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, + struct devlink_rate *parent) +{ + while (parent) { + if (parent == devlink_rate) + return true; + parent = parent->parent; + } + return false; } static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) @@ -944,20 +1192,30 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { + devlink_put(devlink); continue; + } + if (idx < start) { idx++; + devlink_put(devlink); continue; } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); + devlink_put(devlink); if (err) goto out; idx++; @@ -973,7 +1231,6 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; @@ -981,8 +1238,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_port_fill(msg, devlink, devlink_port, - DEVLINK_CMD_PORT_NEW, + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { @@ -999,32 +1255,39 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_port *devlink_port; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; continue; } - err = devlink_nl_port_fill(msg, devlink, devlink_port, + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); + NLM_F_MULTI, cb->extack); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1033,31 +1296,33 @@ out: return msg->len; } -static int devlink_port_type_set(struct devlink *devlink, - struct devlink_port *devlink_port, +static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; - if (devlink->ops->port_type_set) { - if (port_type == devlink_port->type) - return 0; - err = devlink->ops->port_type_set(devlink_port, port_type); - if (err) - return err; - devlink_port->desired_type = port_type; - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + if (!devlink_port->devlink->ops->port_type_set) + return -EOPNOTSUPP; + + if (port_type == devlink_port->type) return 0; - } - return -EOPNOTSUPP; + + err = devlink_port->devlink->ops->port_type_set(devlink_port, + port_type); + if (err) + return err; + + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; } -static int -devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port, - const struct nlattr *attr, struct netlink_ext_ack *extack) +static int devlink_port_function_hw_addr_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) { - const struct devlink_ops *ops; + const struct devlink_ops *ops = port->devlink->ops; const u8 *hw_addr; int hw_addr_len; @@ -1078,17 +1343,16 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * } } - ops = devlink->ops; if (!ops->port_function_hw_addr_set) { NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes"); return -EOPNOTSUPP; } - return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); + return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len, + extack); } -static int devlink_port_fn_state_set(struct devlink *devlink, - struct devlink_port *port, +static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { @@ -1096,18 +1360,18 @@ static int devlink_port_fn_state_set(struct devlink *devlink, const struct devlink_ops *ops; state = nla_get_u8(attr); - ops = devlink->ops; + ops = port->devlink->ops; if (!ops->port_fn_state_set) { NL_SET_ERR_MSG_MOD(extack, "Function does not support state setting"); return -EOPNOTSUPP; } - return ops->port_fn_state_set(devlink, port, state, extack); + return ops->port_fn_state_set(port, state, extack); } -static int -devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, - const struct nlattr *attr, struct netlink_ext_ack *extack) +static int devlink_port_function_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; @@ -1121,7 +1385,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { - err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } @@ -1131,7 +1395,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) - err = devlink_port_fn_state_set(devlink, port, attr, extack); + err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); @@ -1142,14 +1406,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); - err = devlink_port_type_set(devlink, devlink_port, port_type); + err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } @@ -1158,7 +1421,7 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; - err = devlink_port_function_set(devlink, devlink_port, attr, extack); + err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } @@ -1253,9 +1516,8 @@ static int devlink_port_new_notifiy(struct devlink *devlink, goto out; } - err = devlink_nl_port_fill(msg, devlink, devlink_port, - DEVLINK_CMD_NEW, info->snd_portid, - info->snd_seq, 0, NULL); + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0, NULL); if (err) goto out; @@ -1339,6 +1601,255 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, return devlink->ops->port_del(devlink, port_index, extack); } +static int +devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, + struct genl_info *info, + struct nlattr *nla_parent) +{ + struct devlink *devlink = devlink_rate->devlink; + const char *parent_name = nla_data(nla_parent); + const struct devlink_ops *ops = devlink->ops; + size_t len = strlen(parent_name); + struct devlink_rate *parent; + int err = -EOPNOTSUPP; + + parent = devlink_rate->parent; + if (parent && len) { + NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent."); + return -EBUSY; + } else if (parent && !len) { + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + if (err) + return err; + + refcount_dec(&parent->refcnt); + devlink_rate->parent = NULL; + } else if (!parent && len) { + parent = devlink_rate_node_get_by_name(devlink, parent_name); + if (IS_ERR(parent)) + return -ENODEV; + + if (parent == devlink_rate) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed"); + return -EINVAL; + } + + if (devlink_rate_is_node(devlink_rate) && + devlink_rate_is_parent_node(devlink_rate, parent->parent)) { + NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node."); + return -EEXIST; + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + if (err) + return err; + + refcount_inc(&parent->refcnt); + devlink_rate->parent = parent; + } + + return 0; +} + +static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, + const struct devlink_ops *ops, + struct genl_info *info) +{ + struct nlattr *nla_parent, **attrs = info->attrs; + int err = -EOPNOTSUPP; + u64 rate; + + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_share = rate; + } + + if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_max = rate; + } + + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; + if (nla_parent) { + err = devlink_nl_rate_parent_node_set(devlink_rate, info, + nla_parent); + if (err) + return err; + } + + return 0; +} + +static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, + struct genl_info *info, + enum devlink_rate_type type) +{ + struct nlattr **attrs = info->attrs; + + if (type == DEVLINK_RATE_TYPE_LEAF) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_leaf_parent_set) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); + return false; + } + } else if (type == DEVLINK_RATE_TYPE_NODE) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { + NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_node_parent_set) { + NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); + return false; + } + } else { + WARN(1, "Unknown type of rate object"); + return false; + } + + return true; +} + +static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + int err; + + if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) + return -EOPNOTSUPP; + + err = devlink_nl_rate_set(devlink_rate, ops, info); + + if (!err) + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + return err; +} + +static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; + const struct devlink_ops *ops; + int err; + + ops = devlink->ops; + if (!ops || !ops->rate_node_new || !ops->rate_node_del) { + NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported"); + return -EOPNOTSUPP; + } + + if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) + return -EOPNOTSUPP; + + rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); + if (!IS_ERR(rate_node)) + return -EEXIST; + else if (rate_node == ERR_PTR(-EINVAL)) + return -EINVAL; + + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return -ENOMEM; + + rate_node->devlink = devlink; + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); + if (!rate_node->name) { + err = -ENOMEM; + goto err_strdup; + } + + err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); + if (err) + goto err_node_new; + + err = devlink_nl_rate_set(rate_node, ops, info); + if (err) + goto err_rate_set; + + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return 0; + +err_rate_set: + ops->rate_node_del(rate_node, rate_node->priv, info->extack); +err_node_new: + kfree(rate_node->name); +err_strdup: + kfree(rate_node); + return err; +} + +static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_rate *rate_node = info->user_ptr[1]; + struct devlink *devlink = rate_node->devlink; + const struct devlink_ops *ops = devlink->ops; + int err; + + if (refcount_read(&rate_node->refcnt) > 1) { + NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node."); + return -EBUSY; + } + + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); + err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); + if (rate_node->parent) + refcount_dec(&rate_node->parent->refcnt); + list_del(&rate_node->list); + kfree(rate_node->name); + kfree(rate_node); + return err; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -1410,13 +1921,18 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { @@ -1430,11 +1946,14 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1554,14 +2073,19 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_pool_get) - continue; + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, @@ -1572,10 +2096,13 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1767,14 +2294,19 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_port_pool_get) - continue; + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, @@ -1785,10 +2317,13 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2008,14 +2543,18 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_tc_pool_bind_get) - continue; + goto retry; mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { @@ -2028,10 +2567,13 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2207,6 +2749,23 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +{ + struct devlink_rate *devlink_rate; + + /* Take the lock to sync with devlink_rate_nodes_destroy() */ + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) + if (devlink_rate_is_node(devlink_rate)) { + mutex_unlock(&devlink->lock); + NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); + return -EBUSY; + } + mutex_unlock(&devlink->lock); + return 0; +} + static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) { @@ -2221,6 +2780,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (!ops->eswitch_mode_set) return -EOPNOTSUPP; mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = devlink_rate_nodes_check(devlink, mode, info->extack); + if (err) + return err; err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; @@ -3283,10 +3845,12 @@ static void devlink_param_notify(struct devlink *devlink, struct devlink_param_item *param_item, enum devlink_command cmd); -static void devlink_reload_netns_change(struct devlink *devlink, - struct net *dest_net) +static void devlink_ns_change_notify(struct devlink *devlink, + struct net *dest_net, struct net *curr_net, + bool new) { struct devlink_param_item *param_item; + enum devlink_command cmd; /* Userspace needs to be notified about devlink objects * removed from original and entering new network namespace. @@ -3294,17 +3858,18 @@ static void devlink_reload_netns_change(struct devlink *devlink, * reload process so the notifications are generated separatelly. */ - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - devlink_notify(devlink, DEVLINK_CMD_DEL); + if (!dest_net || net_eq(dest_net, curr_net)) + return; - __devlink_net_set(devlink, dest_net); + if (new) + devlink_notify(devlink, DEVLINK_CMD_NEW); - devlink_notify(devlink, DEVLINK_CMD_NEW); + cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, 0, param_item, cmd); + + if (!new) + devlink_notify(devlink, DEVLINK_CMD_DEL); } static bool devlink_reload_supported(const struct devlink_ops *ops) @@ -3384,6 +3949,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, u32 *actions_performed, struct netlink_ext_ack *extack) { u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + struct net *curr_net; int err; if (!devlink->reload_enabled) @@ -3391,18 +3957,22 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); + + curr_net = devlink_net(devlink); + devlink_ns_change_notify(devlink, dest_net, curr_net, false); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; - if (dest_net && !net_eq(dest_net, devlink_net(devlink))) - devlink_reload_netns_change(devlink, dest_net); + if (dest_net && !net_eq(dest_net, curr_net)) + write_pnet(&devlink->_net, dest_net); err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; + devlink_ns_change_notify(devlink, dest_net, curr_net, true); WARN_ON(!(*actions_performed & BIT(action))); /* Catch driver on updating the remote action within devlink reload */ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, @@ -3599,7 +4169,7 @@ out_free_msg: static void devlink_flash_update_begin_notify(struct devlink *devlink) { - struct devlink_flash_notify params = { 0 }; + struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, @@ -3608,7 +4178,7 @@ static void devlink_flash_update_begin_notify(struct devlink *devlink) static void devlink_flash_update_end_notify(struct devlink *devlink) { - struct devlink_flash_notify params = { 0 }; + struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, @@ -3765,6 +4335,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -4035,13 +4620,18 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink_param_item *param_item; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(param_item, &devlink->param_list, list) { if (idx < start) { @@ -4057,11 +4647,14 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -4303,13 +4896,18 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink_port *devlink_port; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { list_for_each_entry(param_item, @@ -4329,12 +4927,15 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -4544,7 +5145,6 @@ static void devlink_nl_region_notify(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd) { - struct devlink *devlink = region->devlink; struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); @@ -4553,8 +5153,9 @@ static void devlink_nl_region_notify(struct devlink_region *region, if (IS_ERR(msg)) return; - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(region->devlink), msg, 0, + DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** @@ -4872,15 +5473,22 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, &idx, start); +retry: + devlink_put(devlink); if (err) goto out; } @@ -5243,6 +5851,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return skb->len; @@ -5251,6 +5860,7 @@ nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: mutex_unlock(&devlink->lock); + devlink_put(devlink); out_dev: mutex_unlock(&devlink_mutex); return err; @@ -5397,22 +6007,20 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - continue; - if (idx < start) { - idx++; + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; - } - if (!devlink->ops->info_get) { - idx++; - continue; - } + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + if (idx < start || !devlink->ops->info_get) + goto inc; mutex_lock(&devlink->lock); err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, @@ -5422,9 +6030,14 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, mutex_unlock(&devlink->lock); if (err == -EOPNOTSUPP) err = 0; - else if (err) + else if (err) { + devlink_put(devlink); break; + } +inc: idx++; +retry: + devlink_put(devlink); } mutex_unlock(&devlink_mutex); @@ -6238,11 +6851,11 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, struct devlink_health_reporter *reporter, enum devlink_command cmd, u32 portid, u32 seq, int flags) { + struct devlink *devlink = reporter->devlink; struct nlattr *reporter_attr; void *hdr; @@ -6319,8 +6932,7 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, if (!msg) return; - err = devlink_nl_health_reporter_fill(msg, reporter->devlink, - reporter, cmd, 0, 0, 0); + err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; @@ -6510,6 +7122,7 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) goto unlock; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return reporter; unlock: @@ -6553,7 +7166,7 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, goto out; } - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, info->snd_portid, info->snd_seq, 0); @@ -6576,13 +7189,18 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, struct devlink_port *port; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry_rep; + mutex_lock(&devlink->reporters_lock); list_for_each_entry(reporter, &devlink->reporter_list, list) { @@ -6590,24 +7208,29 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_health_reporter_fill(msg, devlink, - reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill( + msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + NLM_F_MULTI); if (err) { mutex_unlock(&devlink->reporters_lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->reporters_lock); +retry_rep: + devlink_put(devlink); } - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry_port; + mutex_lock(&devlink->lock); list_for_each_entry(port, &devlink->port_list, list) { mutex_lock(&port->reporters_lock); @@ -6616,14 +7239,15 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill( + msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { mutex_unlock(&port->reporters_lock); mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; @@ -6631,6 +7255,8 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, mutex_unlock(&port->reporters_lock); } mutex_unlock(&devlink->lock); +retry_port: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -6994,8 +7620,9 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, } } -static int devlink_trap_stats_put(struct sk_buff *msg, - struct devlink_stats __percpu *trap_stats) +static int +devlink_trap_group_stats_put(struct sk_buff *msg, + struct devlink_stats __percpu *trap_stats) { struct devlink_stats stats; struct nlattr *attr; @@ -7023,6 +7650,50 @@ nla_put_failure: return -EMSGSIZE; } +static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item) +{ + struct devlink_stats stats; + struct nlattr *attr; + u64 drops = 0; + int err; + + if (devlink->ops->trap_drop_counter_get) { + err = devlink->ops->trap_drop_counter_get(devlink, + trap_item->trap, + &drops); + if (err) + return err; + } + + devlink_trap_stats_read(trap_item->stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (devlink->ops->trap_drop_counter_get && + nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + stats.rx_packets, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + stats.rx_bytes, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd, u32 portid, u32 seq, @@ -7060,7 +7731,7 @@ static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - err = devlink_trap_stats_put(msg, trap_item->stats); + err = devlink_trap_stats_put(msg, devlink, trap_item); if (err) goto nla_put_failure; @@ -7114,13 +7785,18 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, struct devlink_trap_item *trap_item; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < start) { @@ -7134,11 +7810,14 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7277,7 +7956,7 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, group_item->policer_item->policer->id)) goto nla_put_failure; - err = devlink_trap_stats_put(msg, group_item->stats); + err = devlink_trap_group_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; @@ -7333,13 +8012,18 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, u32 portid = NETLINK_CB(cb->skb).portid; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(group_item, &devlink->trap_group_list, list) { @@ -7354,11 +8038,14 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7639,13 +8326,18 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, u32 portid = NETLINK_CB(cb->skb).portid; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { @@ -7660,11 +8352,14 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7801,6 +8496,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 }, [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 }, [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7827,6 +8527,30 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { + .cmd = DEVLINK_CMD_RATE_GET, + .doit = devlink_nl_cmd_rate_get_doit, + .dumpit = devlink_nl_cmd_rate_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_RATE_SET, + .doit = devlink_nl_cmd_rate_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, + }, + { + .cmd = DEVLINK_CMD_RATE_NEW, + .doit = devlink_nl_cmd_rate_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RATE_DEL, + .doit = devlink_nl_cmd_rate_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE, + }, + { .cmd = DEVLINK_CMD_PORT_SPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, @@ -8176,31 +8900,46 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) } /** - * devlink_alloc - Allocate new devlink instance resources + * devlink_alloc_ns - Allocate new devlink instance resources + * in specific namespace * * @ops: ops * @priv_size: size of user private data + * @net: net namespace + * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ -struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev) { struct devlink *devlink; + static u32 last_id; + int ret; - if (WARN_ON(!ops)) - return NULL; - + WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; + + ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret < 0) { + kfree(devlink); + return NULL; + } + + devlink->dev = dev; devlink->ops = ops; xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - __devlink_net_set(devlink, &init_net); + write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); + INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); @@ -8212,22 +8951,22 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); + refcount_set(&devlink->refcount, 1); + init_completion(&devlink->comp); + return devlink; } -EXPORT_SYMBOL_GPL(devlink_alloc); +EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_register - Register devlink instance * * @devlink: devlink - * @dev: parent device */ -int devlink_register(struct devlink *devlink, struct device *dev) +int devlink_register(struct devlink *devlink) { - devlink->dev = dev; - devlink->registered = true; mutex_lock(&devlink_mutex); - list_add_tail(&devlink->list, &devlink_list); + xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); return 0; @@ -8241,11 +8980,14 @@ EXPORT_SYMBOL_GPL(devlink_register); */ void devlink_unregister(struct devlink *devlink) { + devlink_put(devlink); + wait_for_completion(&devlink->comp); + mutex_lock(&devlink_mutex); WARN_ON(devlink_reload_supported(devlink->ops) && devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); - list_del(&devlink->list); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_unregister); @@ -8303,9 +9045,11 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); + xa_erase(&devlinks, devlink->index); kfree(devlink); } @@ -8366,9 +9110,10 @@ int devlink_port_register(struct devlink *devlink, mutex_unlock(&devlink->lock); return -EEXIST; } + + WARN_ON(devlink_port->devlink); devlink_port->devlink = devlink; devlink_port->index = port_index; - devlink_port->registered = true; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); @@ -8407,7 +9152,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->registered)) + if (WARN_ON(!devlink_port->devlink)) return; devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); @@ -8527,7 +9272,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); @@ -8551,7 +9296,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); @@ -8578,7 +9323,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); @@ -8606,7 +9351,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); @@ -8619,6 +9364,110 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); +/** + * devlink_rate_leaf_create - create devlink rate leaf + * + * @devlink_port: devlink port object to create rate object on + * @priv: driver private data + * + * Create devlink rate object of type leaf on provided @devlink_port. + * Throws call trace if @devlink_port already has a devlink rate object. + * + * Context: Takes and release devlink->lock <mutex>. + * + * Return: -ENOMEM if failed to allocate rate object, 0 otherwise. + */ +int +devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +{ + struct devlink *devlink = devlink_port->devlink; + struct devlink_rate *devlink_rate; + + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); + if (!devlink_rate) + return -ENOMEM; + + mutex_lock(&devlink->lock); + WARN_ON(devlink_port->devlink_rate); + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; + devlink_rate->devlink = devlink; + devlink_rate->devlink_port = devlink_port; + devlink_rate->priv = priv; + list_add_tail(&devlink_rate->list, &devlink->rate_list); + devlink_port->devlink_rate = devlink_rate; + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + mutex_unlock(&devlink->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_rate_leaf_create); + +/** + * devlink_rate_leaf_destroy - destroy devlink rate leaf + * + * @devlink_port: devlink port linked to the rate object + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) +{ + struct devlink_rate *devlink_rate = devlink_port->devlink_rate; + struct devlink *devlink = devlink_port->devlink; + + if (!devlink_rate) + return; + + mutex_lock(&devlink->lock); + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + if (devlink_rate->parent) + refcount_dec(&devlink_rate->parent->refcnt); + list_del(&devlink_rate->list); + devlink_port->devlink_rate = NULL; + mutex_unlock(&devlink->lock); + kfree(devlink_rate); +} +EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); + +/** + * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device + * + * @devlink: devlink instance + * + * Unset parent for all rate objects and destroy all rate nodes + * on specified device. + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_rate_nodes_destroy(struct devlink *devlink) +{ + static struct devlink_rate *devlink_rate, *tmp; + const struct devlink_ops *ops = devlink->ops; + + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (!devlink_rate->parent) + continue; + + refcount_dec(&devlink_rate->parent->refcnt); + if (devlink_rate_is_leaf(devlink_rate)) + ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + else if (devlink_rate_is_node(devlink_rate)) + ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + } + list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate)) { + ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); + list_del(&devlink_rate->list); + kfree(devlink_rate->name); + kfree(devlink_rate); + } + } + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy); + static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { @@ -8630,12 +9479,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: - if (!attrs->split) - n = snprintf(name, len, "p%u", attrs->phys.port_number); - else - n = snprintf(name, len, "p%us%u", - attrs->phys.port_number, - attrs->phys.split_subport_number); + n = snprintf(name, len, "p%u", attrs->phys.port_number); + if (n < len && attrs->split) + n += snprintf(name + n, len - n, "s%u", + attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: @@ -9092,6 +9939,22 @@ static int devlink_param_verify(const struct devlink_param *param) return devlink_param_driver_verify(param); } +static int __devlink_param_register_one(struct devlink *devlink, + unsigned int port_index, + struct list_head *param_list, + const struct devlink_param *param, + enum devlink_command reg_cmd) +{ + int err; + + err = devlink_param_verify(param); + if (err) + return err; + + return devlink_param_register_one(devlink, port_index, + param_list, param, reg_cmd); +} + static int __devlink_params_register(struct devlink *devlink, unsigned int port_index, struct list_head *param_list, @@ -9106,12 +9969,8 @@ static int __devlink_params_register(struct devlink *devlink, mutex_lock(&devlink->lock); for (i = 0; i < params_count; i++, param++) { - err = devlink_param_verify(param); - if (err) - goto rollback; - - err = devlink_param_register_one(devlink, port_index, - param_list, param, reg_cmd); + err = __devlink_param_register_one(devlink, port_index, + param_list, param, reg_cmd); if (err) goto rollback; } @@ -9184,6 +10043,43 @@ void devlink_params_unregister(struct devlink *devlink, EXPORT_SYMBOL_GPL(devlink_params_unregister); /** + * devlink_param_register - register one configuration parameter + * + * @devlink: devlink + * @param: one configuration parameter + * + * Register the configuration parameter supported by the driver. + * Return: returns 0 on successful registration or error code otherwise. + */ +int devlink_param_register(struct devlink *devlink, + const struct devlink_param *param) +{ + int err; + + mutex_lock(&devlink->lock); + err = __devlink_param_register_one(devlink, 0, &devlink->param_list, + param, DEVLINK_CMD_PARAM_NEW); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_param_register); + +/** + * devlink_param_unregister - unregister one configuration parameter + * @devlink: devlink + * @param: configuration parameter to unregister + */ +void devlink_param_unregister(struct devlink *devlink, + const struct devlink_param *param) +{ + mutex_lock(&devlink->lock); + devlink_param_unregister_one(devlink, 0, &devlink->param_list, param, + DEVLINK_CMD_PARAM_DEL); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_param_unregister); + +/** * devlink_params_publish - publish configuration parameters * * @devlink: devlink @@ -9226,6 +10122,54 @@ void devlink_params_unpublish(struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_params_unpublish); /** + * devlink_param_publish - publish one configuration parameter + * + * @devlink: devlink + * @param: one configuration parameter + * + * Publish previously registered configuration parameter. + */ +void devlink_param_publish(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, &devlink->param_list, list) { + if (param_item->param != param || param_item->published) + continue; + param_item->published = true; + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); + break; + } +} +EXPORT_SYMBOL_GPL(devlink_param_publish); + +/** + * devlink_param_unpublish - unpublish one configuration parameter + * + * @devlink: devlink + * @param: one configuration parameter + * + * Unpublish previously registered configuration parameter. + */ +void devlink_param_unpublish(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, &devlink->param_list, list) { + if (param_item->param != param || !param_item->published) + continue; + param_item->published = false; + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + break; + } +} +EXPORT_SYMBOL_GPL(devlink_param_unpublish); + +/** * devlink_port_params_register - register port configuration parameters * * @devlink_port: devlink port @@ -10580,23 +11524,29 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; + unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (net_eq(devlink_net(devlink), net)) { - if (WARN_ON(!devlink_reload_supported(devlink->ops))) - continue; - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - } + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), net)) + goto retry; + + WARN_ON(!devlink_reload_supported(devlink->ops)); + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); +retry: + devlink_put(devlink); } mutex_unlock(&devlink_mutex); } diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ead2a8aa57b4..49442cae6f69 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -850,8 +850,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) } hw_metadata->input_dev = metadata->input_dev; - if (hw_metadata->input_dev) - dev_hold(hw_metadata->input_dev); + dev_hold(hw_metadata->input_dev); return hw_metadata; @@ -867,8 +866,7 @@ free_hw_metadata: static void net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) { - if (hw_metadata->input_dev) - dev_put(hw_metadata->input_dev); + dev_put(hw_metadata->input_dev); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); diff --git a/net/core/dst.c b/net/core/dst.c index fb3bcba87744..497ef9b3fc6a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -49,8 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags) { dst->dev = dev; - if (dev) - dev_hold(dev); + dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; @@ -118,8 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); - if (dst->dev) - dev_put(dst->dev); + dev_put(dst->dev); lwtstate_put(dst->lwtstate); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a9f937975080..79df7cd9dbc1 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -57,7 +57,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; @@ -541,7 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - nlrule = kzalloc(ops->rule_size, GFP_KERNEL); + nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index 65ab4e21c087..2e32cee2c469 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -17,6 +17,7 @@ * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ +#include <linux/atomic.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> @@ -41,7 +42,6 @@ #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/unaligned.h> -#include <asm/cmpxchg.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> @@ -77,6 +77,7 @@ #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> +#include <net/xdp.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -113,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to BPF_PROG_RUN. It returns 0 if the packet should + * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -2179,17 +2180,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, skb->tstamp = 0; if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, hh_len); - if (unlikely(!skb2)) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } rcu_read_lock_bh(); @@ -2213,8 +2206,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } rcu_read_unlock_bh(); if (dst) - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; @@ -2286,17 +2278,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, skb->tstamp = 0; if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, hh_len); - if (unlikely(!skb2)) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } rcu_read_lock_bh(); @@ -3241,9 +3225,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; @@ -3255,19 +3236,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV4 needs to be changed into - * SKB_GSO_TCPV6. - */ + /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } - - /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); @@ -3282,9 +3255,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; - ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; @@ -3296,19 +3266,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* SKB_GSO_TCPV6 needs to be changed into - * SKB_GSO_TCPV4. - */ + /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } - - /* Due to IPv4 header, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); @@ -3902,8 +3864,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; - if (unlikely((metalen & (sizeof(__u32) - 1)) || - (metalen > 32))) + if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; @@ -3919,6 +3880,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; +/* XDP_REDIRECT works by a three-step process, implemented in the functions + * below: + * + * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target + * of the redirect and store it (along with some other metadata) in a per-CPU + * struct bpf_redirect_info. + * + * 2. When the program returns the XDP_REDIRECT return code, the driver will + * call xdp_do_redirect() which will use the information in struct + * bpf_redirect_info to actually enqueue the frame into a map type-specific + * bulk queue structure. + * + * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), + * which will flush all the different bulk queues, thus completing the + * redirect. + * + * Pointers to the map entries will be kept around for this whole sequence of + * steps, protected by RCU. However, there is no top-level rcu_read_lock() in + * the core code; instead, the RCU protection relies on everything happening + * inside a single NAPI poll sequence, which means it's between a pair of calls + * to local_bh_disable()/local_bh_enable(). + * + * The map entries are marked as __rcu and the map code makes sure to + * dereference those pointers with rcu_dereference_check() in a way that works + * for both sections that to hold an rcu_read_lock() and sections that are + * called from NAPI without a separate rcu_read_lock(). The code below does not + * use RCU annotations, but relies on those in the map code. + */ void xdp_do_flush(void) { __dev_flush(); @@ -3927,6 +3916,48 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); +void bpf_clear_redirect_map(struct bpf_map *map) +{ + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } +} + +DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); +EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); + +u32 xdp_master_redirect(struct xdp_buff *xdp) +{ + struct net_device *master, *slave; + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); + slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); + if (slave && slave != xdp->rxq->dev) { + /* The target device is different from the receiving device, so + * redirect it to the new device. + * Using XDP_REDIRECT gets the correct behaviour from XDP enabled + * drivers to unmap the packet from their rx ring. + */ + ri->tgt_index = slave->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + return XDP_REDIRECT; + } + return XDP_TX; +} +EXPORT_SYMBOL_GPL(xdp_master_redirect); + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { @@ -3934,6 +3965,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ @@ -3943,7 +3975,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_enqueue(fwd, xdp, dev); + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_enqueue_multi(xdp, dev, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_enqueue(fwd, xdp, dev); + } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdp, dev); @@ -3985,13 +4024,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - err = dev_map_generic_redirect(fwd, skb, xdp_prog); + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_redirect_multi(dev, skb, xdp_prog, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_generic_redirect(fwd, skb, xdp_prog); + } if (unlikely(err)) goto err; break; @@ -4001,8 +4048,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, goto err; consume_skb(skb); break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; default: - /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } @@ -4625,6 +4676,30 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; +BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { + .func = bpf_get_netns_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { + .func = bpf_get_netns_cookie_sk_msg, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -4973,6 +5048,46 @@ err_clear: return -EINVAL; } +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_TCP && optname == TCP_CONGESTION) { + if (optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, optlen)) + return -ENOTSUPP; + } + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { @@ -7406,6 +7521,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; @@ -7452,6 +7569,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -10008,11 +10127,13 @@ out: static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; @@ -10021,13 +10142,14 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; - bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); - action = BPF_PROG_RUN(prog, &reuse_kern); + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); + action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; @@ -10136,6 +10258,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -10165,6 +10289,14 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, hash): return size == size_default; + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) @@ -10237,6 +10369,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3ed7c98a98e1..bac0184cf3de 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -943,8 +943,8 @@ bool __skb_flow_dissect(const struct net *net, int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; - /* Tail taggers don't break flow dissection */ - if (!ops->tail_tag) { + /* Only DSA header taggers break flow dissection */ + if (ops->needed_headroom) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else @@ -1056,8 +1056,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } @@ -1101,8 +1103,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &iph->saddr, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -1504,7 +1508,7 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); -/* Sort the source and destination IP (and the ports if the IP are the same), +/* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) @@ -1515,11 +1519,11 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) case FLOW_DISSECTOR_KEY_IPV4_ADDRS: addr_diff = (__force u32)keys->addrs.v4addrs.dst - (__force u32)keys->addrs.v4addrs.src; - if ((addr_diff < 0) || - (addr_diff == 0 && - ((__force u16)keys->ports.dst < - (__force u16)keys->ports.src))) { + if (addr_diff < 0) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; @@ -1527,13 +1531,13 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); - if ((addr_diff < 0) || - (addr_diff == 0 && - ((__force u16)keys->ports.dst < - (__force u16)keys->ports.src))) { + if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); + } + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 715b67f6c62f..6beaea13564a 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -321,13 +321,13 @@ EXPORT_SYMBOL(flow_block_cb_setup_simple); static DEFINE_MUTEX(flow_indr_block_lock); static LIST_HEAD(flow_block_indr_list); static LIST_HEAD(flow_block_indr_dev_list); +static LIST_HEAD(flow_indir_dev_list); struct flow_indr_dev { struct list_head list; flow_indr_block_bind_cb_t *cb; void *cb_priv; refcount_t refcnt; - struct rcu_head rcu; }; static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, @@ -346,6 +346,33 @@ static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, return indr_dev; } +struct flow_indir_dev_info { + void *data; + struct net_device *dev; + struct Qdisc *sch; + enum tc_setup_type type; + void (*cleanup)(struct flow_block_cb *block_cb); + struct list_head list; + enum flow_block_command command; + enum flow_block_binder_type binder_type; + struct list_head *cb_list; +}; + +static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_block_offload bo; + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + memset(&bo, 0, sizeof(bo)); + bo.command = cur->command; + bo.binder_type = cur->binder_type; + INIT_LIST_HEAD(&bo.cb_list); + cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup); + list_splice(&bo.cb_list, cur->cb_list); + } +} + int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_indr_dev *indr_dev; @@ -367,6 +394,7 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) } list_add(&indr_dev->list, &flow_block_indr_dev_list); + existing_qdiscs_register(cb, cb_priv); mutex_unlock(&flow_indr_block_lock); return 0; @@ -463,7 +491,59 @@ out: } EXPORT_SYMBOL(flow_indr_block_cb_alloc); -int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, +static struct flow_indir_dev_info *find_indir_dev(void *data) +{ + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + if (cur->data == data) + return cur; + } + return NULL; +} + +static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb), + struct flow_block_offload *bo) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (info) + return -EEXIST; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->data = data; + info->dev = dev; + info->sch = sch; + info->type = type; + info->cleanup = cleanup; + info->command = bo->command; + info->binder_type = bo->binder_type; + info->cb_list = bo->cb_list_head; + + list_add(&info->list, &flow_indir_dev_list); + return 0; +} + +static int indir_dev_remove(void *data) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (!info) + return -ENOENT; + + list_del(&info->list); + + kfree(info); + return 0; +} + +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)) @@ -471,6 +551,12 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, struct flow_indr_dev *this; mutex_lock(&flow_indr_block_lock); + + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + list_for_each_entry(this, &flow_block_indr_dev_list, list) this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 75431ca9300f..1a455847da54 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -158,7 +158,7 @@ static void linkwatch_do_dev(struct net_device *dev) clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); - if (dev->flags & IFF_UP && netif_device_present(dev)) { + if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) dev_activate(dev); else @@ -204,7 +204,8 @@ static void __linkwatch_run_queue(int urgent_only) dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); - if (urgent_only && !linkwatch_urgent_event(dev)) { + if (!netif_device_present(dev) || + (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 8ec7d13d2860..2820aca2173a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,9 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); + #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) @@ -43,6 +46,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bf774575ad71..2d5bc3a75fae 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -741,12 +741,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; - if (dev) - dev_hold(dev); + dev_hold(dev); if (tbl->pconstructor && tbl->pconstructor(n)) { - if (dev) - dev_put(dev); + dev_put(dev); kfree(n); n = NULL; goto out; @@ -778,8 +776,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); + dev_put(n->dev); kfree(n); return 0; } @@ -812,8 +809,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); + dev_put(n->dev); kfree(n); } return -ENOENT; @@ -1662,8 +1658,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); + dev_put(parms->dev); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -2533,6 +2528,13 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) return false; master = dev ? netdev_master_upper_dev_get(dev) : NULL; + + /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another + * invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + if (!master || master->ifindex != master_idx) return true; @@ -3142,7 +3144,7 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; - int bucket = state->bucket; + int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { @@ -3315,12 +3317,13 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); + seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } - seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx %08lx\n", + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx " + "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..eab5fc88a002 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -77,8 +77,8 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " - "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + seq_printf(seq, "%9s: %16llu %12llu %4llu %6llu %4llu %5llu %10llu %9llu " + "%16llu %12llu %4llu %6llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, @@ -103,11 +103,11 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) static int dev_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); + seq_puts(seq, "Interface| Receive " + " | Transmit\n" + " | bytes packets errs drop fifo frame " + "compressed multicast| bytes packets errs " + " drop fifo colls carrier compressed\n"); else dev_seq_printf_stats(seq, v); return 0; @@ -259,14 +259,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v) struct packet_type *pt = v; if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); + seq_puts(seq, "Type Device Function\n"); else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s %ps\n", + seq_printf(seq, " %-9s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } @@ -327,12 +327,14 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) struct netdev_hw_addr *ha; struct net_device *dev = v; - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Ifindex Interface Refcount Global_use Address\n"); return 0; + } netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", + seq_printf(seq, "%-7d %-9s %-8d %-10d %*phN\n", dev->ifindex, dev->name, ha->refcount, ha->global_use, (int)dev->addr_len, ha->addr); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 283ddb2dbc7d..c40cd8dd75c7 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -60,3 +60,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9b5a767eddd5..a448a9b5bb2d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -98,7 +98,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) } ng = net_alloc_generic(); - if (ng == NULL) + if (!ng) return -ENOMEM; /* @@ -148,13 +148,6 @@ out: return err; } -static void ops_free(const struct pernet_operations *ops, struct net *net) -{ - if (ops->id && ops->size) { - kfree(net_generic(net, *ops->id)); - } -} - static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { @@ -184,7 +177,7 @@ static void ops_free_list(const struct pernet_operations *ops, struct net *net; if (ops->size && ops->id) { list_for_each_entry(net, net_exit_list, exit_list) - ops_free(ops, net); + kfree(net_generic(net, *ops->id)); } } @@ -433,15 +426,18 @@ out_free: static void net_free(struct net *net) { - kfree(rcu_access_pointer(net->gen)); - kmem_cache_free(net_cachep, net); + if (refcount_dec_and_test(&net->passive)) { + kfree(rcu_access_pointer(net->gen)); + kmem_cache_free(net_cachep, net); + } } void net_drop_ns(void *p) { - struct net *ns = p; - if (ns && refcount_dec_and_test(&ns->passive)) - net_free(ns); + struct net *net = (struct net *)p; + + if (net) + net_free(net); } struct net *copy_net_ns(unsigned long flags, @@ -479,7 +475,7 @@ struct net *copy_net_ns(unsigned long flags, put_userns: key_remove_domain(net->key_domain); put_user_ns(user_ns); - net_drop_ns(net); + net_free(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -611,7 +607,7 @@ static void cleanup_net(struct work_struct *work) dec_net_namespaces(net->ucounts); key_remove_domain(net->key_domain); put_user_ns(net->user_ns); - net_drop_ns(net); + net_free(net); } } @@ -1120,6 +1116,14 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); +static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) +{ + ops_pre_exit_list(ops, net_exit_list); + synchronize_rcu(); + ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); +} + #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) @@ -1145,10 +1149,7 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + free_exit_list(ops, &net_exit_list); return error; } @@ -1161,10 +1162,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + + free_exit_list(ops, &net_exit_list); } #else @@ -1187,10 +1186,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + free_exit_list(ops, &net_exit_list); } } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c310c7c1cef7..edfc0f8011f8 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -36,6 +36,7 @@ #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> +#include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -389,7 +390,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static atomic_t ip_ident; struct ipv6hdr *ip6h; - WARN_ON_ONCE(!irqs_disabled()); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) @@ -428,7 +430,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) ip6h = ipv6_hdr(skb); /* ip6h->version = 6; ip6h->priority = 0; */ - put_unaligned(0x60, (unsigned char *)ip6h); + *(unsigned char *)ip6h = 0x60; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; @@ -456,7 +458,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) iph = ip_hdr(skb); /* iph->version = 4; iph->ihl = 5; */ - put_unaligned(0x45, (unsigned char *)iph); + *(unsigned char *)iph = 0x45; iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); iph->id = htons(atomic_inc_return(&ip_ident)); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 3c4c4c7a0402..1a6978427d6c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -17,12 +17,15 @@ #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for __put_page() */ +#include <linux/poison.h> #include <trace/events/page_pool.h> #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) +#define BIAS_MAX LONG_MAX + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -66,6 +69,10 @@ static int page_pool_init(struct page_pool *pool, */ } + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + pool->p.flags & PP_FLAG_PAGE_FRAG) + return -EINVAL; + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -205,6 +212,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; } +static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -221,6 +241,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } + page_pool_set_pp_info(pool, page); + /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); @@ -263,6 +285,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } + + page_pool_set_pp_info(pool, page); pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -341,10 +365,12 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: + page_pool_clear_pp_info(page); + /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ - count = atomic_inc_return(&pool->pages_state_release_cnt); + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } EXPORT_SYMBOL(page_pool_release_page); @@ -399,6 +425,11 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + /* It is not the last user for the page frag case */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + page_pool_atomic_sub_frag_count_return(page, 1)) + return NULL; + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -491,6 +522,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, } EXPORT_SYMBOL(page_pool_put_page_bulk); +static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_atomic_sub_frag_count_return(page, + drain_count))) + return NULL; + + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || + page_pool_atomic_sub_frag_count_return(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) + goto frag_reset; + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_set_frag_count(page, BIAS_MAX); + return page; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -596,6 +705,8 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; + page_pool_free_frag(pool); + if (!page_pool_release(pool)) return; @@ -622,3 +733,32 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 3fba429f1f57..a3d74e2704c4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -175,6 +175,9 @@ #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) +/* Max number of internet mix entries that can be specified in imix_weights. */ +#define MAX_IMIX_ENTRIES 20 +#define IMIX_PRECISION 100 /* Precision of IMIX distribution */ #define func_enter() pr_debug("entering %s\n", __func__); @@ -242,6 +245,12 @@ static char *pkt_flag_names[] = { #define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) #define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) +struct imix_pkt { + u64 size; + u64 weight; + u64 count_so_far; +}; + struct flow_state { __be32 cur_daddr; int count; @@ -343,6 +352,12 @@ struct pktgen_dev { __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */ + /* IMIX */ + unsigned int n_imix_entries; + struct imix_pkt imix_entries[MAX_IMIX_ENTRIES]; + /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/ + __u8 imix_distribution[IMIX_PRECISION]; + /* MPLS */ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */ __be32 labels[MAX_MPLS_LABELS]; @@ -467,10 +482,11 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, static int pktgen_device_event(struct notifier_block *, unsigned long, void *); static void pktgen_run_all_threads(struct pktgen_net *pn); static void pktgen_reset_all_threads(struct pktgen_net *pn); -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn); +static void pktgen_stop_all_threads(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static void fill_imix_distribution(struct pktgen_dev *pkt_dev); /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; @@ -516,14 +532,11 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ if (!strcmp(data, "stop")) - pktgen_stop_all_threads_ifs(pn); - + pktgen_stop_all_threads(pn); else if (!strcmp(data, "start")) pktgen_run_all_threads(pn); - else if (!strcmp(data, "reset")) pktgen_reset_all_threads(pn); - else return -EINVAL; @@ -555,6 +568,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + if (pkt_dev->n_imix_entries > 0) { + seq_puts(seq, " imix_weights: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].weight); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " frags: %d delay: %llu clone_skb: %d ifname: %s\n", pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, @@ -672,6 +695,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->sofar, (unsigned long long)pkt_dev->errors); + if (pkt_dev->n_imix_entries > 0) { + int i; + + seq_puts(seq, " imix_size_counts: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].count_so_far); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " started: %lluus stopped: %lluus idle: %lluus\n", (unsigned long long) ktime_to_us(pkt_dev->started_at), @@ -795,6 +830,62 @@ done_str: return i; } +/* Parses imix entries from user buffer. + * The user buffer should consist of imix entries separated by spaces + * where each entry consists of size and weight delimited by commas. + * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. + */ +static ssize_t get_imix_entries(const char __user *buffer, + struct pktgen_dev *pkt_dev) +{ + const int max_digits = 10; + int i = 0; + long len; + char c; + + pkt_dev->n_imix_entries = 0; + + do { + unsigned long weight; + unsigned long size; + + len = num_arg(&buffer[i], max_digits, &size); + if (len < 0) + return len; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + /* Check for comma between size_i and weight_i */ + if (c != ',') + return -EINVAL; + i++; + + if (size < 14 + 20 + 8) + size = 14 + 20 + 8; + + len = num_arg(&buffer[i], max_digits, &weight); + if (len < 0) + return len; + if (weight <= 0) + return -EINVAL; + + pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size; + pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; + + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + + i++; + pkt_dev->n_imix_entries++; + + if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) + return -E2BIG; + } while (c == ' '); + + return i; +} + static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) { unsigned int n = 0; @@ -963,6 +1054,20 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + if (!strcmp(name, "imix_weights")) { + if (pkt_dev->clone_skb > 0) + return -EINVAL; + + len = get_imix_entries(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + + fill_imix_distribution(pkt_dev); + + i += len; + return count; + } + if (!strcmp(name, "debug")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1085,10 +1190,16 @@ static ssize_t pktgen_if_write(struct file *file, len = num_arg(&user_buffer[i], 10, &value); if (len < 0) return len; + /* clone_skb is not supported for netif_receive xmit_mode and + * IMIX mode. + */ if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; + if (value > 0 && pkt_dev->n_imix_entries > 0) + return -EINVAL; + i += len; pkt_dev->clone_skb = value; @@ -1193,11 +1304,6 @@ static ssize_t pktgen_if_write(struct file *file, * pktgen_xmit() is called */ pkt_dev->last_ok = 1; - - /* override clone_skb if user passed default value - * at module loading time - */ - pkt_dev->clone_skb = 0; } else if (strcmp(f, "queue_xmit") == 0) { pkt_dev->xmit_mode = M_QUEUE_XMIT; pkt_dev->last_ok = 1; @@ -2480,6 +2586,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) t = pkt_dev->min_pkt_size; } pkt_dev->cur_pkt_size = t; + } else if (pkt_dev->n_imix_entries > 0) { + struct imix_pkt *entry; + __u32 t = prandom_u32() % IMIX_PRECISION; + __u8 entry_index = pkt_dev->imix_distribution[t]; + + entry = &pkt_dev->imix_entries[entry_index]; + entry->count_so_far++; + pkt_dev->cur_pkt_size = entry->size; } set_cur_queue_map(pkt_dev); @@ -2487,6 +2601,32 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].count++; } +static void fill_imix_distribution(struct pktgen_dev *pkt_dev) +{ + int cumulative_probabilites[MAX_IMIX_ENTRIES]; + int j = 0; + __u64 cumulative_prob = 0; + __u64 total_weight = 0; + int i = 0; + + for (i = 0; i < pkt_dev->n_imix_entries; i++) + total_weight += pkt_dev->imix_entries[i].weight; + + /* Fill cumulative_probabilites with sum of normalized probabilities */ + for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) { + cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight * + IMIX_PRECISION, + total_weight); + cumulative_probabilites[i] = cumulative_prob; + } + cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100; + + for (i = 0; i < IMIX_PRECISION; i++) { + if (i == cumulative_probabilites[j]) + j++; + pkt_dev->imix_distribution[i] = j; + } +} #ifdef CONFIG_XFRM static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { @@ -3027,20 +3167,25 @@ static void pktgen_run(struct pktgen_thread *t) t->control &= ~(T_STOP); } -static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn) +static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags) { struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= T_STOP; + t->control |= (flags); mutex_unlock(&pktgen_thread_lock); } +static void pktgen_stop_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_STOP); +} + static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; @@ -3103,16 +3248,9 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) static void pktgen_run_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_RUN); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_RUN); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); @@ -3122,16 +3260,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn) static void pktgen_reset_all_threads(struct pktgen_net *pn) { - struct pktgen_thread *t; - func_enter(); - mutex_lock(&pktgen_thread_lock); - - list_for_each_entry(t, &pn->pktgen_threads, th_list) - t->control |= (T_REMDEVALL); - - mutex_unlock(&pktgen_thread_lock); + pktgen_handle_all_threads(pn, T_REMDEVALL); /* Propagate thread->control */ schedule_timeout_interruptible(msecs_to_jiffies(125)); @@ -3157,7 +3288,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, ktime_to_ns(elapsed)); - bps = pps * 8 * pkt_dev->cur_pkt_size; + if (pkt_dev->n_imix_entries > 0) { + int i; + struct imix_pkt *entry; + + bps = 0; + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + entry = &pkt_dev->imix_entries[i]; + bps += entry->size * entry->count_so_far; + } + bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed)); + } else { + bps = pps * 8 * pkt_dev->cur_pkt_size; + } mbps = bps; do_div(mbps, 1000000); @@ -3459,7 +3602,6 @@ out: static int pktgen_thread_worker(void *arg) { - DEFINE_WAIT(wait); struct pktgen_thread *t = arg; struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index e33fde06d528..dd4cf01d1e0a 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return BPF_PROG_RUN(ptp_insns, skb); + return bpf_prog_run(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ec931b080156..972c8cb303a5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -9,7 +9,7 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: - * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> @@ -234,7 +234,7 @@ unlock: * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Like rtnl_register, but for use by removable modules. */ @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(rtnl_register_module); * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the @@ -376,12 +376,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - /* The check for setup is here because if ops + /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ - if (ops->setup && !ops->dellink) + if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -543,7 +543,9 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; - list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + ASSERT_RTNL(); + + list_for_each_entry(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } @@ -708,15 +710,8 @@ out: int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; - int err = 0; - NETLINK_CB(skb).dst_group = group; - if (echo) - refcount_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); - if (echo) - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - return err; + return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) @@ -731,12 +726,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; - int report = 0; - if (nlh) - report = nlmsg_report(nlh); - - nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); @@ -1819,6 +1810,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure; + if (dev->dev.parent && + nla_put_string(skb, IFLA_PARENT_DEV_NAME, + dev_name(dev->dev.parent))) + goto nla_put_failure; + + if (dev->dev.parent && dev->dev.parent->bus && + nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, + dev->dev.parent->bus->name)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1878,6 +1879,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1957,6 +1959,13 @@ static bool link_master_filtered(struct net_device *dev, int master_idx) return false; master = netdev_master_upper_dev_get(dev); + + /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need + * another invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + if (!master || master->ifindex != master_idx) return true; @@ -2255,7 +2264,8 @@ invalid_attr: return -EINVAL; } -static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack) { if (dev) { if (tb[IFLA_ADDRESS] && @@ -2274,27 +2284,18 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - rcu_read_lock(); af_ops = rtnl_af_lookup(nla_type(af)); - if (!af_ops) { - rcu_read_unlock(); + if (!af_ops) return -EAFNOSUPPORT; - } - if (!af_ops->set_link_af) { - rcu_read_unlock(); + if (!af_ops->set_link_af) return -EOPNOTSUPP; - } if (af_ops->validate_link_af) { - err = af_ops->validate_link_af(dev, af); - if (err < 0) { - rcu_read_unlock(); + err = af_ops->validate_link_af(dev, af, extack); + if (err < 0) return err; - } } - - rcu_read_unlock(); } } @@ -2574,7 +2575,7 @@ static int do_set_proto_down(struct net_device *dev, if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); - /* Dont turn off protodown if there are active reasons */ + /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; @@ -2599,11 +2600,12 @@ static int do_setlink(const struct sk_buff *skb, const struct net_device_ops *ops = dev->netdev_ops; int err; - err = validate_linkmsg(dev, tb); + err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + const char *pat = ifname && ifname[0] ? ifname : NULL; struct net *net; int new_ifindex; @@ -2619,7 +2621,7 @@ static int do_setlink(const struct sk_buff *skb, else new_ifindex = 0; - err = __dev_change_net_namespace(dev, net, ifname, new_ifindex); + err = __dev_change_net_namespace(dev, net, pat, new_ifindex); put_net(net); if (err) goto errout; @@ -2868,17 +2870,12 @@ static int do_setlink(const struct sk_buff *skb, nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - rcu_read_lock(); - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af, extack); - if (err < 0) { - rcu_read_unlock(); + if (err < 0) goto errout; - } - rcu_read_unlock(); status |= DO_SETLINK_NOTIFY; } } @@ -3177,8 +3174,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, return ERR_PTR(-EINVAL); } - dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, - ops->setup, num_tx_queues, num_rx_queues); + if (ops->alloc) { + dev = ops->alloc(tb, ifname, name_assign_type, + num_tx_queues, num_rx_queues); + if (IS_ERR(dev)) + return dev; + } else { + dev = alloc_netdev_mqs(ops->priv_size, ifname, + name_assign_type, ops->setup, + num_tx_queues, num_rx_queues); + } + if (!dev) return ERR_PTR(-ENOMEM); @@ -3293,7 +3299,7 @@ replay: m_ops = master_dev->rtnl_link_ops; } - err = validate_linkmsg(dev, tb); + err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; @@ -3411,7 +3417,7 @@ replay: return -EOPNOTSUPP; } - if (!ops->setup) + if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (!ifname[0]) { @@ -3939,12 +3945,12 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { - pr_info("%s: FDB only supports static addresses\n", dev->name); + netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (vid) { - pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name); + netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } @@ -4078,7 +4084,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { - pr_info("%s: FDB only supports static addresses\n", dev->name); + netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } diff --git a/net/core/scm.c b/net/core/scm.c index ae3085d9aae8..5c356f0dee30 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -79,7 +79,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -355,7 +355,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return NULL; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); diff --git a/net/core/selftests.c b/net/core/selftests.c index ba7b0171974c..9077fa969892 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -318,6 +318,15 @@ static int net_test_phy_loopback_udp(struct net_device *ndev) return __net_test_loopback(ndev, &attr); } +static int net_test_phy_loopback_udp_mtu(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.max_size = ndev->mtu; + return __net_test_loopback(ndev, &attr); +} + static int net_test_phy_loopback_tcp(struct net_device *ndev) { struct net_packet_attrs attr = { }; @@ -345,6 +354,9 @@ static const struct net_test { .name = "PHY internal loopback, UDP ", .fn = net_test_phy_loopback_udp, }, { + .name = "PHY internal loopback, MTU ", + .fn = net_test_phy_loopback_udp_mtu, + }, { .name = "PHY internal loopback, TCP ", .fn = net_test_phy_loopback_tcp, }, { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bbc3b4b62032..2170bea2c7de 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> +#include <net/page_pool.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -155,7 +156,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) void *data; fragsz = SKB_DATA_ALIGN(fragsz); - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); } else { @@ -501,7 +502,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; @@ -645,10 +646,13 @@ static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; - if (skb->head_frag) + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; skb_free_frag(head); - else + } else { kfree(head); + } } static void skb_release_data(struct sk_buff *skb) @@ -659,17 +663,28 @@ static void skb_release_data(struct sk_buff *skb) if (skb->cloned && atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, &shinfo->dataref)) - return; + goto exit; skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i]); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); skb_free_head(skb); +exit: + /* When we clone an SKB we copy the reycling bit. The pp_recycle + * bit is only set on the head though, so in order to avoid races + * while trying to recycle fragments on __skb_frag_unref() we need + * to make one SKB responsible for triggering the recycle path. + * So disable the recycling bit if an SKB is cloned and we have + * additional references to to the fragmented part of the SKB. + * Eventually the last SKB will have the recycling bit set and it's + * dataref set to 0, which will trigger the recycling + */ + skb->pp_recycle = 0; } /* @@ -709,7 +724,7 @@ void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); if (skb->destructor) { - WARN_ON(in_irq()); + WARN_ON(in_hardirq()); skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -939,8 +954,13 @@ void __kfree_skb_defer(struct sk_buff *skb) void napi_skb_free_stolen_head(struct sk_buff *skb) { - skb_dst_drop(skb); - skb_ext_put(skb); + if (unlikely(skb->slow_gro)) { + nf_reset_ct(skb); + skb_dst_drop(skb); + skb_ext_put(skb); + skb_orphan(skb); + skb->slow_gro = 0; + } napi_skb_cache_put(skb); } @@ -1046,6 +1066,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->nohdr = 0; n->peeked = 0; C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -1289,7 +1310,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg) } spin_unlock_irqrestore(&q->lock, flags); - sk->sk_error_report(sk); + sk_error_report(sk); release: consume_skb(skb); @@ -1769,6 +1790,48 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) EXPORT_SYMBOL(skb_realloc_headroom); /** + * skb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @headroom: needed headroom + * + * Unlike skb_realloc_headroom, this one does not allocate a new skb + * if possible; copies skb->sk to new skb as needed + * and frees original skb in case of failures. + * + * It expect increased headroom and generates warning otherwise. + */ + +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) +{ + int delta = headroom - skb_headroom(skb); + + if (WARN_ONCE(delta <= 0, + "%s is expecting an increase in the headroom", __func__)) + return skb; + + /* pskb_expand_head() might crash, if skb is shared */ + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (likely(nskb)) { + if (skb->sk) + skb_set_owner_w(nskb, skb->sk); + consume_skb(skb); + } else { + kfree_skb(skb); + } + skb = nskb; + } + if (skb && + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(skb); + skb = NULL; + } + return skb; +} +EXPORT_SYMBOL(skb_expand_head); + +/** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head @@ -3005,8 +3068,11 @@ skb_zerocopy_headlen(const struct sk_buff *from) if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); + if (!hlen) + hlen = from->len; + } if (skb_has_frag_list(from)) hlen = from->len; @@ -3497,7 +3563,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ @@ -3818,7 +3884,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); - __copy_skb_header(nskb, skb); + __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); skb_copy_from_linear_data_offset(skb, -tnl_hlen, @@ -3869,6 +3935,9 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->last = skb; NAPI_GRO_CB(p)->count++; p->data_len += skb->len; + + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; p->truesize += skb->truesize; p->len += skb->len; @@ -4236,6 +4305,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int new_truesize; struct sk_buff *lp; if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) @@ -4267,10 +4337,10 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ - delta_truesize = skb->truesize - - SKB_TRUESIZE(skb_end_offset(skb)); + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; - skb->truesize -= skb->data_len; + skb->truesize = new_truesize; skb->len -= skb->data_len; skb->data_len = 0; @@ -4299,12 +4369,16 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ - delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } merge: + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; @@ -4680,7 +4754,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); @@ -4711,7 +4785,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) sk->sk_err = 0; if (skb_next) - sk->sk_error_report(sk); + sk_error_report(sk); return skb; } @@ -5287,6 +5361,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; + /* The page pool signature of struct page will eventually figure out + * which pages can be recycled or not but for now let's prohibit slab + * allocated and page_pool allocated SKBs from being coalesced. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + if (len <= skb_tailroom(to)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); @@ -6422,6 +6503,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) new->chunks = newlen; new->offset[id] = newoff; set_active: + skb->slow_gro = 1; skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 43ce17a6a585..2d6249b28928 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,29 +399,6 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_wait_data); - /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) @@ -531,10 +508,8 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, if (skb_linearize(skb)) return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); - if (unlikely(num_sge < 0)) { - kfree(msg); + if (unlikely(num_sge < 0)) return num_sge; - } copied = skb->len; msg->sg.start = 0; @@ -553,6 +528,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) { struct sock *sk = psock->sk; struct sk_msg *msg; + int err; /* If we are receiving on the same sock skb->sk is already assigned, * skip memory accounting and owner transition seeing it already set @@ -571,7 +547,10 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) * into user buffers. */ skb_set_owner_r(skb, sk); - return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); + if (err < 0) + kfree(msg); + return err; } /* Puts an skb on the ingress queue of the socket already assigned to the @@ -582,12 +561,16 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb { struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); struct sock *sk = psock->sk; + int err; if (unlikely(!msg)) return -EAGAIN; sk_msg_init(msg); skb_set_owner_r(skb, sk); - return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); + if (err < 0) + kfree(msg); + return err; } static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, @@ -601,23 +584,42 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return sk_psock_skb_ingress(psock, skb); } +static void sk_psock_skb_state(struct sk_psock *psock, + struct sk_psock_work_state *state, + struct sk_buff *skb, + int len, int off) +{ + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + state->skb = skb; + state->len = len; + state->off = off; + } else { + sock_drop(psock->sk, skb); + } + spin_unlock_bh(&psock->ingress_lock); +} + static void sk_psock_backlog(struct work_struct *work) { struct sk_psock *psock = container_of(work, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; - struct sk_buff *skb; + struct sk_buff *skb = NULL; bool ingress; u32 len, off; int ret; mutex_lock(&psock->work_mutex); - if (state->skb) { + if (unlikely(state->skb)) { + spin_lock_bh(&psock->ingress_lock); skb = state->skb; len = state->len; off = state->off; state->skb = NULL; - goto start; + spin_unlock_bh(&psock->ingress_lock); } + if (skb) + goto start; while ((skb = skb_dequeue(&psock->ingress_skb))) { len = skb->len; @@ -632,15 +634,14 @@ start: len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { - state->skb = skb; - state->len = len; - state->off = off; + sk_psock_skb_state(psock, state, skb, + len, off); goto end; } /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - kfree_skb(skb); + sock_drop(psock->sk, skb); goto end; } off += ret; @@ -731,8 +732,13 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(psock->sk, skb); } + kfree_skb(psock->work_state.skb); + /* We null the skb here to ensure that calls to sk_psock_backlog + * do not pick up the free'd skb. + */ + psock->work_state.skb = NULL; __sk_psock_purge_ingress_msg(psock); } @@ -784,8 +790,6 @@ static void sk_psock_destroy(struct work_struct *work) void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_stop(psock, false); - write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); @@ -795,6 +799,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); + sk_psock_stop(psock, false); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); } @@ -847,7 +853,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static void sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -857,8 +863,8 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { - kfree_skb(skb); - return; + sock_drop(from->sk, skb); + return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another @@ -866,26 +872,30 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * a socket that is in this state so we drop the skb. */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { - kfree_skb(skb); - return; + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); - kfree_skb(skb); - return; + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); spin_unlock_bh(&psock_other->ingress_lock); + return 0; } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, + struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: @@ -909,20 +919,21 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, psock->sk, ret); + sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, + int verdict) { struct sock *sk_other; - int err = -EIO; + int err = 0; switch (verdict) { case __SK_PASS: + err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { @@ -945,18 +956,25 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_work(&psock->work); + err = 0; } spin_unlock_bh(&psock->ingress_lock); + if (err < 0) { + skb_bpf_redirect_clear(skb); + goto out_free; + } } break; case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: - kfree_skb(skb); + sock_drop(psock->sk, skb); } + + return err; } static void sk_psock_write_space(struct sock *sk) @@ -988,7 +1006,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); @@ -1109,7 +1127,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); @@ -1123,7 +1141,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_verdict_apply(psock, skb, ret); + if (sk_psock_verdict_apply(psock, skb, ret) < 0) + len = 0; out: rcu_read_unlock(); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 946888afef88..62627e868e03 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include <net/tcp.h> #include <net/busy_poll.h> +#include <linux/ethtool.h> + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -224,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { @@ -331,6 +334,22 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(__sk_backlog_rcv); +void sk_error_report(struct sock *sk) +{ + sk->sk_error_report(sk); + + switch (sk->sk_family) { + case AF_INET: + fallthrough; + case AF_INET6: + trace_inet_sk_error_report(sk); + break; + default: + break; + } +} +EXPORT_SYMBOL(sk_error_report); + static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; @@ -776,6 +795,103 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_timestamp(struct sock *sk, int optname, bool valbool) +{ + switch (optname) { + case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; + case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; + case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; + case SO_TIMESTAMPNS_NEW: + __sock_set_timestamps(sk, valbool, true, true); + break; + } +} + +static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) +{ + struct net *net = sock_net(sk); + struct net_device *dev = NULL; + bool match = false; + int *vclock_index; + int i, num; + + if (sk->sk_bound_dev_if) + dev = dev_get_by_index(net, sk->sk_bound_dev_if); + + if (!dev) { + pr_err("%s: sock not bind to device\n", __func__); + return -EOPNOTSUPP; + } + + num = ethtool_get_phc_vclocks(dev, &vclock_index); + for (i = 0; i < num; i++) { + if (*(vclock_index + i) == phc_index) { + match = true; + break; + } + } + + if (num > 0) + kfree(vclock_index); + + if (!match) + return -EINVAL; + + sk->sk_bind_phc = phc_index; + + return 0; +} + +int sock_set_timestamping(struct sock *sk, int optname, + struct so_timestamping timestamping) +{ + int val = timestamping.flags; + int ret; + + if (val & ~SOF_TIMESTAMPING_MASK) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) + return -EINVAL; + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_BIND_PHC) { + ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); + if (ret) + return ret; + } + + sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + return 0; +} + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -839,6 +955,7 @@ EXPORT_SYMBOL(sock_set_mark); int sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct so_timestamping timestamping; struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; @@ -997,54 +1114,25 @@ set_sndbuf: break; case SO_TIMESTAMP_OLD: - __sock_set_timestamps(sk, valbool, false, false); - break; case SO_TIMESTAMP_NEW: - __sock_set_timestamps(sk, valbool, true, false); - break; case SO_TIMESTAMPNS_OLD: - __sock_set_timestamps(sk, valbool, false, true); - break; case SO_TIMESTAMPNS_NEW: - __sock_set_timestamps(sk, valbool, true, true); + sock_set_timestamp(sk, optname, valbool); break; + case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: - if (val & ~SOF_TIMESTAMPING_MASK) { - ret = -EINVAL; - break; - } - - if (val & SOF_TIMESTAMPING_OPT_ID && - !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { - if ((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN)) { - ret = -EINVAL; - break; - } - sk->sk_tskey = tcp_sk(sk)->snd_una; - } else { - sk->sk_tskey = 0; + if (optlen == sizeof(timestamping)) { + if (copy_from_sockptr(×tamping, optval, + sizeof(timestamping))) { + ret = -EFAULT; + break; } + } else { + memset(×tamping, 0, sizeof(timestamping)); + timestamping.flags = val; } - - if (val & SOF_TIMESTAMPING_OPT_STATS && - !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { - ret = -EINVAL; - break; - } - - sk->sk_tsflags = val; - sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); - - if (val & SOF_TIMESTAMPING_RX_SOFTWARE) - sock_enable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); - else - sock_disable_timestamp(sk, - (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + ret = sock_set_timestamping(sk, optname, timestamping); break; case SO_RCVLOWAT: @@ -1172,7 +1260,7 @@ set_sndbuf: if (val < 0) ret = -EINVAL; else - sk->sk_ll_usec = val; + WRITE_ONCE(sk->sk_ll_usec, val); } break; case SO_PREFER_BUSY_POLL: @@ -1270,6 +1358,15 @@ set_sndbuf: ret = sock_bindtoindex_locked(sk, val); break; + case SO_BUF_LOCK: + if (val & ~SOCK_BUF_LOCK_MASK) { + ret = -EINVAL; + break; + } + sk->sk_userlocks = val | (sk->sk_userlocks & + ~SOCK_BUF_LOCK_MASK); + break; + default: ret = -ENOPROTOOPT; break; @@ -1319,6 +1416,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; struct sock_txtime txtime; + struct so_timestamping timestamping; } v; int lv = sizeof(int); @@ -1422,7 +1520,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_TIMESTAMPING_OLD: - v.val = sk->sk_tsflags; + lv = sizeof(v.timestamping); + v.timestamping.flags = sk->sk_tsflags; + v.timestamping.bind_phc = sk->sk_bind_phc; break; case SO_RCVTIMEO_OLD: @@ -1622,6 +1722,17 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_bound_dev_if; break; + case SO_NETNS_COOKIE: + lv = sizeof(u64); + if (len != lv) + return -EINVAL; + v.val64 = sock_net(sk)->net_cookie; + break; + + case SO_BUF_LOCK: + v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2463,7 +2574,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -#define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** @@ -2617,10 +2727,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; long allocated = sk_memory_allocated_add(sk, amt); + bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; bool charged = true; - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) + if (memcg_charge && + !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge()))) goto suppress_allocation; /* Under limit. */ @@ -2674,8 +2786,14 @@ suppress_allocation: /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ - if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { + /* Force charge with __GFP_NOFAIL */ + if (memcg_charge && !charged) { + mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge() | __GFP_NOFAIL); + } return 1; + } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) @@ -2683,7 +2801,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (memcg_charge && charged) mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); return 0; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 6f1b82b8ad49..e252b8ec2b85 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - stab->sks = bpf_map_area_alloc(stab->map.max_entries * + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { @@ -211,8 +211,6 @@ out: return psock; } -static bool sock_map_redirect_allowed(const struct sock *sk); - static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); @@ -223,13 +221,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock *psock; int ret; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (!sock_map_redirect_allowed(sk)) - goto no_progs; - stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); @@ -264,7 +255,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } -no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -527,12 +517,6 @@ static bool sk_is_tcp(const struct sock *sk) sk->sk_protocol == IPPROTO_TCP; } -static bool sk_is_udp(const struct sock *sk) -{ - return sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) @@ -550,10 +534,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); - else if (sk_is_udp(sk)) - return sk_hashed(sk); - - return false; + return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, @@ -1513,6 +1494,7 @@ void sock_map_unhash(struct sock *sk) rcu_read_unlock(); saved_unhash(sk); } +EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_close(struct sock *sk, long timeout) { @@ -1536,6 +1518,7 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } +EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index b065f0a103ed..3f00a28fe762 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -6,6 +6,7 @@ * selecting the socket index from the array of available sockets. */ +#include <net/ip.h> #include <net/sock_reuseport.h> #include <linux/bpf.h> #include <linux/idr.h> @@ -17,6 +18,74 @@ DEFINE_SPINLOCK(reuseport_lock); static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); + +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ + smp_wmb(); + reuse->num_socks++; +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + + return true; +} + +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + + return true; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -49,6 +118,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + /* Only set reuse->bind_inany if the bind_inany is true. * Otherwise, it will overwrite the reuse->bind_inany * which was set by the bind/hash path. @@ -72,9 +147,9 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) } reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; - reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -90,14 +165,30 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) u32 more_socks_size, i; more_socks_size = reuse->max_socks * 2U; - if (more_socks_size > U16_MAX) + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. + */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + return NULL; + } more_reuse = __reuseport_alloc(more_socks_size); if (!more_reuse) return NULL; more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; @@ -105,9 +196,13 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); - for (i = 0; i < reuse->num_socks; ++i) + for (i = 0; i < reuse->max_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, more_reuse); @@ -152,13 +247,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)); + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + if (old_reuse && old_reuse->num_socks != 1) { spin_unlock_bh(&reuseport_lock); return -EBUSY; } - if (reuse->num_socks == reuse->max_socks) { + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); if (!reuse) { spin_unlock_bh(&reuseport_lock); @@ -166,10 +269,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } } - reuse->socks[reuse->num_socks] = sk; - /* paired with smp_rmb() in reuseport_select_sock() */ - smp_wmb(); - reuse->num_socks++; + __reuseport_add_sock(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); spin_unlock_bh(&reuseport_lock); @@ -180,15 +280,77 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) } EXPORT_SYMBOL(reuseport_add_sock); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. + */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. + */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; - int i; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + /* Notify the bpf side. The sk may be added to a sockarray * map. If so, sockarray logic will remove it from the map. * @@ -201,19 +363,52 @@ void reuseport_detach_sock(struct sock *sk) rcu_assign_pointer(sk->sk_reuseport_cb, NULL); - for (i = 0; i < reuse->num_socks; i++) { - if (reuse->socks[i] == sk) { - reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; - reuse->num_socks--; - if (reuse->num_socks == 0) - call_rcu(&reuse->rcu, reuseport_free_rcu); - break; - } - } + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + +out: spin_unlock_bh(&reuseport_lock); } EXPORT_SYMBOL(reuseport_detach_sock); +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + struct bpf_prog *prog; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { + /* Migration capable, move sk from the listening section + * to the closed section. + */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, struct bpf_prog *prog, struct sk_buff *skb, int hdr_len) @@ -244,6 +439,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, return reuse->socks[index]; } +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= num_socks) + i = 0; + if (i == j) + return NULL; + } + + return reuse->socks[i]; +} + /** * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. * @sk: First socket in the group. @@ -274,32 +486,21 @@ struct sock *reuseport_select_sock(struct sock *sk, prog = rcu_dereference(reuse->prog); socks = READ_ONCE(reuse->num_socks); if (likely(socks)) { - /* paired with smp_wmb() in reuseport_add_sock() */ + /* paired with smp_wmb() in __reuseport_add_sock() */ smp_rmb(); if (!prog || !skb) goto select_by_hash; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); else sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) { - int i, j; - - i = j = reciprocal_scale(hash, socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { - i++; - if (i >= socks) - i = 0; - if (i == j) - goto out; - } - sk2 = reuse->socks[i]; - } + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); } out: @@ -308,14 +509,90 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). + */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto failure; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + goto select_by_hash; + goto failure; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto failure; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { + nsk = NULL; + goto failure; + } + +out: + rcu_read_unlock(); + return nsk; + +failure: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + goto out; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (sk_unhashed(sk) && sk->sk_reuseport) { - int err = reuseport_alloc(sk, false); + if (sk_unhashed(sk)) { + int err; + if (!sk->sk_reuseport) + return -EINVAL; + + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -341,13 +618,24 @@ int reuseport_detach_prog(struct sock *sk) struct sock_reuseport *reuse; struct bpf_prog *old_prog; - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return sk->sk_reuseport ? -ENOENT : -EINVAL; - old_prog = NULL; spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? -ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + old_prog = rcu_replace_pointer(reuse->prog, old_prog, lockdep_is_held(&reuseport_lock)); spin_unlock_bh(&reuseport_lock); diff --git a/net/core/xdp.c b/net/core/xdp.c index 858276e72c68..cc92ccb38432 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -113,8 +113,13 @@ static void mem_allocator_disconnect(void *allocator) void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; + int type = xdp_rxq->mem.type; int id = xdp_rxq->mem.id; + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { WARN(1, "Missing register, driver bug"); return; @@ -123,7 +128,7 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + if (type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); @@ -144,10 +149,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; - - /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -584,3 +585,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, return __xdp_build_skb_from_frame(xdpf, skb, dev); } EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); + +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) +{ + unsigned int headroom, totalsize; + struct xdp_frame *nxdpf; + struct page *page; + void *addr; + + headroom = xdpf->headroom + sizeof(*xdpf); + totalsize = headroom + xdpf->len; + + if (unlikely(totalsize > PAGE_SIZE)) + return NULL; + page = dev_alloc_page(); + if (!page) + return NULL; + addr = page_to_virt(page); + + memcpy(addr, xdpf, totalsize); + + nxdpf = addr; + nxdpf->data = addr + headroom; + nxdpf->frame_sz = PAGE_SIZE; + nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + nxdpf->mem.id = 0; + + return nxdpf; +} diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 653e3bc9c87b..b441ab330fd3 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1381,7 +1381,7 @@ static int dcbnl_notify(struct net_device *dev, int event, int cmd, skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) - return -ENOBUFS; + return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); @@ -1781,7 +1781,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) - return -ENOBUFS; + return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { @@ -2075,8 +2075,6 @@ EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static int __init dcbnl_init(void) { - INIT_LIST_HEAD(&dcb_app_list); - rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index e2a337fa9ff7..92a8c6bea316 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -688,6 +688,7 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) /** * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * @loss_event_rate: loss event rate to invert * When @loss_event_rate is large, there is a chance that p is truncated to 0. * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. */ diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9cc9d1ee6cdb..c5c1d2b8045e 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -41,9 +41,9 @@ extern bool dccp_debug; #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else -#define dccp_pr_debug(format, a...) -#define dccp_pr_debug_cat(format, a...) -#define dccp_debug(format, a...) +#define dccp_pr_debug(format, a...) do {} while (0) +#define dccp_pr_debug_cat(format, a...) do {} while (0) +#define dccp_debug(format, a...) do {} while (0) #endif extern struct inet_hashinfo dccp_hashinfo; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ffc601a3b329..0ea29270d7e5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -329,7 +329,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); dccp_done(sk); } else @@ -356,7 +356,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else /* Only an error on timeout */ sk->sk_err_soft = err; out: @@ -977,7 +977,6 @@ static const struct net_protocol dccp_v4_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6f5304db5a67..fa663518fa0e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -172,7 +172,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, * Wake people up to see the error * (see connect in sock.c) */ - sk->sk_error_report(sk); + sk_error_report(sk); dccp_done(sk); } else sk->sk_err_soft = err; @@ -181,7 +181,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else sk->sk_err_soft = err; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 6d705d90c614..abb5c596a817 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -302,7 +302,7 @@ int dccp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } @@ -1126,7 +1126,7 @@ static int __init dccp_init(void) dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; diff --git a/net/dccp/timer.c b/net/dccp/timer.c index db768f223ef7..27a3b37acd2e 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -20,7 +20,7 @@ int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; static void dccp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 5dbd45dc35ad..dc92a67baea3 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -816,7 +816,7 @@ static int dn_auto_bind(struct socket *sock) static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) { struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err; if (scp->state != DN_CR) @@ -826,11 +826,11 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk)); dn_send_conn_conf(sk, allocation); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); for(;;) { release_sock(sk); if (scp->state == DN_CC) - *timeo = schedule_timeout(*timeo); + *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); lock_sock(sk); err = 0; if (scp->state == DN_RUN) @@ -844,9 +844,8 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) err = -EAGAIN; if (!*timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); if (err == 0) { sk->sk_socket->state = SS_CONNECTED; } else if (scp->state != DN_CC) { @@ -858,7 +857,7 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) static int dn_wait_run(struct sock *sk, long *timeo) { struct dn_scp *scp = DN_SK(sk); - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err = 0; if (scp->state == DN_RUN) @@ -867,11 +866,11 @@ static int dn_wait_run(struct sock *sk, long *timeo) if (!*timeo) return -EALREADY; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); for(;;) { release_sock(sk); if (scp->state == DN_CI || scp->state == DN_CC) - *timeo = schedule_timeout(*timeo); + *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); lock_sock(sk); err = 0; if (scp->state == DN_RUN) @@ -885,9 +884,8 @@ static int dn_wait_run(struct sock *sk, long *timeo) err = -ETIMEDOUT; if (!*timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); out: if (err == 0) { sk->sk_socket->state = SS_CONNECTED; @@ -1032,16 +1030,16 @@ static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt) static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sk_buff *skb = NULL; int err = 0; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); for(;;) { release_sock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { - *timeo = schedule_timeout(*timeo); + *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo); skb = skb_dequeue(&sk->sk_receive_queue); } lock_sock(sk); @@ -1056,9 +1054,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) err = -EAGAIN; if (!*timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); return skb == NULL ? ERR_PTR(err) : skb; } diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index d1c50a48614b..0ee7d4c0c955 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -521,8 +521,7 @@ int dn_dev_set_default(struct net_device *dev, int force) } spin_unlock(&dndev_lock); - if (old) - dev_put(old); + dev_put(old); return rv; } @@ -536,8 +535,7 @@ static void dn_dev_check_default(struct net_device *dev) } spin_unlock(&dndev_lock); - if (dev) - dev_put(dev); + dev_put(dev); } /* diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 77fbf8e9df4b..269c029ad74f 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -92,8 +92,7 @@ void dn_fib_free_info(struct dn_fib_info *fi) } change_nexthops(fi) { - if (nh->nh_dev) - dev_put(nh->nh_dev); + dev_put(nh->nh_dev); nh->nh_dev = NULL; } endfor_nexthops(fi); kfree(fi); @@ -102,7 +101,7 @@ void dn_fib_free_info(struct dn_fib_info *fi) void dn_fib_release_info(struct dn_fib_info *fi) { spin_lock(&dn_fib_info_lock); - if (fi && --fi->fib_treeref == 0) { + if (fi && refcount_dec_and_test(&fi->fib_treeref)) { if (fi->fib_next) fi->fib_next->fib_prev = fi->fib_prev; if (fi->fib_prev) @@ -385,11 +384,11 @@ link_it: if ((ofi = dn_fib_find_info(fi)) != NULL) { fi->fib_dead = 1; dn_fib_free_info(fi); - ofi->fib_treeref++; + refcount_inc(&ofi->fib_treeref); return ofi; } - fi->fib_treeref++; + refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); spin_lock(&dn_fib_info_lock); fi->fib_next = dn_fib_info_list; diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 1a12912b88d6..7ab788f41a3f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -870,7 +870,7 @@ int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) /* * Read out ack data here, this applies equally - * to data, other data, link serivce and both + * to data, other data, link service and both * ack data and ack otherdata. */ dn_process_ack(sk, skb, other); diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 00f2ed721ec1..eadc89583168 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -179,7 +179,7 @@ static void dn_nsp_rtt(struct sock *sk, long rtt) scp->nsp_srtt = 1; /* - * Add new rtt varience to smoothed varience + * Add new rtt variance to smoothed varience */ delta >>= 1; rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 32b1bed8ae51..7e85f2a1ae25 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -604,7 +604,7 @@ drop_it: static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb) { /* - * I know we drop the packet here, but thats considered success in + * I know we drop the packet here, but that's considered success in * this case */ kfree_skb(skb); @@ -1026,8 +1026,7 @@ source_ok: if (!fld.daddr) { fld.daddr = fld.saddr; - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); err = -EINVAL; dev_out = init_net.loopback_dev; if (!dev_out->dn_ptr) @@ -1084,8 +1083,7 @@ source_ok: neigh_release(neigh); neigh = NULL; } else { - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); if (dn_dev_islocal(neigh->dev, fld.daddr)) { dev_out = init_net.loopback_dev; res.type = RTN_LOCAL; @@ -1144,8 +1142,7 @@ select_source: if (res.type == RTN_LOCAL) { if (!fld.saddr) fld.saddr = fld.daddr; - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); dev_out = init_net.loopback_dev; dev_hold(dev_out); if (!dev_out->dn_ptr) @@ -1168,8 +1165,7 @@ select_source: if (!fld.saddr) fld.saddr = DN_FIB_RES_PREFSRC(res); - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); dev_out = DN_FIB_RES_DEV(res); dev_hold(dev_out); fld.flowidn_oif = dev_out->ifindex; @@ -1222,8 +1218,7 @@ done: neigh_release(neigh); if (free_res) dn_fib_res_put(&res); - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); out: return err; @@ -1503,8 +1498,7 @@ done: if (free_res) dn_fib_res_put(&res); dev_put(in_dev); - if (out_dev) - dev_put(out_dev); + dev_put(out_dev); out: return err; diff --git a/net/devres.c b/net/devres.c index 1f9be2133787..5ccf6ca311dc 100644 --- a/net/devres.c +++ b/net/devres.c @@ -60,7 +60,7 @@ static int netdev_devres_match(struct device *dev, void *this, void *match_data) * @ndev: device to register * * This is a devres variant of register_netdev() for which the unregister - * function will be call automatically when the managing device is + * function will be called automatically when the managing device is * detached. Note: the net_device used must also be resource managed by * the same struct device. */ diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 00bb89b2d86f..548285539752 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -18,16 +18,6 @@ if NET_DSA # Drivers must select the appropriate tagging format(s) -config NET_DSA_TAG_8021Q - tristate - select VLAN_8021Q - help - Unlike the other tagging protocols, the 802.1Q config option simply - provides helpers for other tagging implementations that might rely on - VLAN in one way or another. It is not a complete solution. - - Drivers which use these helpers should select this as dependency. - config NET_DSA_TAG_AR9331 tristate "Tag driver for Atheros AR9331 SoC with built-in switch" help @@ -126,7 +116,6 @@ config NET_DSA_TAG_OCELOT_8021Q tristate "Tag driver for Ocelot family of switches, using VLAN" depends on MSCC_OCELOT_SWITCH_LIB || \ (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) - select NET_DSA_TAG_8021Q help Say Y or M if you want to enable support for tagging frames with a custom VLAN-based header. Frames that require timestamping, such as @@ -149,7 +138,7 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" - select NET_DSA_TAG_8021Q + depends on NET_DSA_SJA1105 || !NET_DSA_SJA1105 select PACKING help Say Y or M if you want to enable support for tagging frames with the diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 44bc79952b8b..67ea009f242c 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,10 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o tag_8021q.o # tagging formats -obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 84cad1be9ce4..1dc45e40f961 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -238,7 +238,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = cpu_dp->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b71e87909f0e..1b2b25d7bd02 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,6 +21,9 @@ static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); +/* Track the bridges with forwarding offload enabled */ +static unsigned long dsa_fwd_offloading_bridges; + /** * dsa_tree_notify - Execute code for all switches in a DSA switch tree. * @dst: collection of struct dsa_switch devices to notify. @@ -49,6 +52,9 @@ int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v) * Can be used to notify the switching fabric of events such as cross-chip * bridging between disjoint trees (such as islands of tagger-compatible * switches bridged by an incompatible middle switch). + * + * WARNING: this function is not reliable during probe time, because probing + * between trees is asynchronous and not all DSA trees might have probed. */ int dsa_broadcast(unsigned long e, void *v) { @@ -123,6 +129,51 @@ void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag) } } +static int dsa_bridge_num_find(const struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst; + struct dsa_port *dp; + + /* When preparing the offload for a port, it will have a valid + * dp->bridge_dev pointer but a not yet valid dp->bridge_num. + * However there might be other ports having the same dp->bridge_dev + * and a valid dp->bridge_num, so just ignore this port. + */ + list_for_each_entry(dst, &dsa_tree_list, list) + list_for_each_entry(dp, &dst->ports, list) + if (dp->bridge_dev == bridge_dev && + dp->bridge_num != -1) + return dp->bridge_num; + + return -1; +} + +int dsa_bridge_num_get(const struct net_device *bridge_dev, int max) +{ + int bridge_num = dsa_bridge_num_find(bridge_dev); + + if (bridge_num < 0) { + /* First port that offloads TX forwarding for this bridge */ + bridge_num = find_first_zero_bit(&dsa_fwd_offloading_bridges, + DSA_MAX_NUM_OFFLOADING_BRIDGES); + if (bridge_num >= max) + return -1; + + set_bit(bridge_num, &dsa_fwd_offloading_bridges); + } + + return bridge_num; +} + +void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num) +{ + /* Check if the bridge is still in use, otherwise it is time + * to clean it up so we can reuse this bridge_num later. + */ + if (!dsa_bridge_num_find(bridge_dev)) + clear_bit(bridge_num, &dsa_fwd_offloading_bridges); +} + struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) { struct dsa_switch_tree *dst; @@ -219,21 +270,6 @@ static void dsa_tree_put(struct dsa_switch_tree *dst) kref_put(&dst->refcount, dsa_tree_release); } -static bool dsa_port_is_dsa(struct dsa_port *port) -{ - return port->type == DSA_PORT_TYPE_DSA; -} - -static bool dsa_port_is_cpu(struct dsa_port *port) -{ - return port->type == DSA_PORT_TYPE_CPU; -} - -static bool dsa_port_is_user(struct dsa_port *dp) -{ - return dp->type == DSA_PORT_TYPE_USER; -} - static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, struct device_node *dn) { @@ -326,6 +362,9 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) return NULL; } +/* Assign the default CPU port (the first one in the tree) to all ports of the + * fabric which don't already have one as part of their own switch. + */ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { struct dsa_port *cpu_dp, *dp; @@ -336,15 +375,48 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) return -EINVAL; } - /* Assign the default CPU port to all ports of the fabric */ - list_for_each_entry(dp, &dst->ports, list) + list_for_each_entry(dp, &dst->ports, list) { + if (dp->cpu_dp) + continue; + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) dp->cpu_dp = cpu_dp; + } return 0; } -static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) +/* Perform initial assignment of CPU ports to user ports and DSA links in the + * fabric, giving preference to CPU ports local to each switch. Default to + * using the first CPU port in the switch tree if the port does not have a CPU + * port local to this switch. + */ +static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp, *dp; + + list_for_each_entry(cpu_dp, &dst->ports, list) { + if (!dsa_port_is_cpu(cpu_dp)) + continue; + + list_for_each_entry(dp, &dst->ports, list) { + /* Prefer a local CPU port */ + if (dp->ds != cpu_dp->ds) + continue; + + /* Prefer the first local CPU port found */ + if (dp->cpu_dp) + continue; + + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; + } + } + + return dsa_tree_setup_default_cpu(dst); +} + +static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst) { struct dsa_port *dp; @@ -363,6 +435,9 @@ static int dsa_port_setup(struct dsa_port *dp) if (dp->setup) return 0; + INIT_LIST_HEAD(&dp->fdbs); + INIT_LIST_HEAD(&dp->mdbs); + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -458,6 +533,7 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + struct dsa_mac_addr *a, *tmp; if (!dp->setup) return; @@ -483,6 +559,16 @@ static void dsa_port_teardown(struct dsa_port *dp) break; } + list_for_each_entry_safe(a, tmp, &dp->fdbs, list) { + list_del(&a->list); + kfree(a); + } + + list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { + list_del(&a->list); + kfree(a); + } + dp->setup = false; } @@ -711,13 +797,14 @@ static int dsa_switch_setup(struct dsa_switch *ds) /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables */ - ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv)); + ds->devlink = + devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev); if (!ds->devlink) return -ENOMEM; dl_priv = devlink_priv(ds->devlink); dl_priv->ds = ds; - err = devlink_register(ds->devlink, ds->dev); + err = devlink_register(ds->devlink); if (err) goto free_devlink; @@ -922,13 +1009,13 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) if (!complete) return 0; - err = dsa_tree_setup_default_cpu(dst); + err = dsa_tree_setup_cpu_ports(dst); if (err) return err; err = dsa_tree_setup_switches(dst); if (err) - goto teardown_default_cpu; + goto teardown_cpu_ports; err = dsa_tree_setup_master(dst); if (err) @@ -948,8 +1035,8 @@ teardown_master: dsa_tree_teardown_master(dst); teardown_switches: dsa_tree_teardown_switches(dst); -teardown_default_cpu: - dsa_tree_teardown_default_cpu(dst); +teardown_cpu_ports: + dsa_tree_teardown_cpu_ports(dst); return err; } @@ -967,7 +1054,7 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_switches(dst); - dsa_tree_teardown_default_cpu(dst); + dsa_tree_teardown_cpu_ports(dst); list_for_each_entry_safe(dl, next, &dst->rtable, list) { list_del(&dl->list); @@ -1045,6 +1132,7 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) dp->ds = ds; dp->index = index; + dp->bridge_num = -1; INIT_LIST_HEAD(&dp->list); list_add_tail(&dp->list, &dst->ports); @@ -1259,6 +1347,16 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, if (!ds->dst) return -ENOMEM; + if (dsa_switch_find(ds->dst->index, ds->index)) { + dev_err(ds->dev, + "A DSA switch with index %d already exists in tree %d\n", + ds->index, ds->dst->index); + return -EEXIST; + } + + if (ds->dst->last_switch < ds->index) + ds->dst->last_switch = ds->index; + return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 92282de54230..33ab7d7af9eb 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -14,12 +14,16 @@ #include <net/dsa.h> #include <net/gro_cells.h> +#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG + enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, DSA_NOTIFIER_BRIDGE_LEAVE, DSA_NOTIFIER_FDB_ADD, DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_HOST_FDB_ADD, + DSA_NOTIFIER_HOST_FDB_DEL, DSA_NOTIFIER_HSR_JOIN, DSA_NOTIFIER_HSR_LEAVE, DSA_NOTIFIER_LAG_CHANGE, @@ -27,6 +31,8 @@ enum { DSA_NOTIFIER_LAG_LEAVE, DSA_NOTIFIER_MDB_ADD, DSA_NOTIFIER_MDB_DEL, + DSA_NOTIFIER_HOST_MDB_ADD, + DSA_NOTIFIER_HOST_MDB_DEL, DSA_NOTIFIER_VLAN_ADD, DSA_NOTIFIER_VLAN_DEL, DSA_NOTIFIER_MTU, @@ -35,6 +41,8 @@ enum { DSA_NOTIFIER_MRP_DEL, DSA_NOTIFIER_MRP_ADD_RING_ROLE, DSA_NOTIFIER_MRP_DEL_RING_ROLE, + DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, + DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -84,7 +92,7 @@ struct dsa_notifier_vlan_info { /* DSA_NOTIFIER_MTU */ struct dsa_notifier_mtu_info { - bool propagate_upstream; + bool targeted_match; int sw_index; int port; int mtu; @@ -109,9 +117,18 @@ struct dsa_notifier_mrp_ring_role_info { int port; }; +/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */ +struct dsa_notifier_tag_8021q_vlan_info { + int tree_index; + int sw_index; + int port; + u16 vid; +}; + struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; + struct net_device *dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and @@ -119,6 +136,7 @@ struct dsa_switchdev_event_work { */ unsigned char addr[ETH_ALEN]; u16 vid; + bool host_addr; }; /* DSA_NOTIFIER_HSR_* */ @@ -154,6 +172,11 @@ const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); +static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) +{ + return ops->needed_headroom + ops->needed_tailroom; +} + /* master.c */ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); void dsa_master_teardown(struct net_device *dev); @@ -176,43 +199,51 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, /* port.c */ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops); -int dsa_port_set_state(struct dsa_port *dp, u8 state); +int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age); int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack); +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_lag_change(struct dsa_port *dp, struct netdev_lag_lower_state_info *linfo); int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack); +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack); bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, - bool propagate_upstream); + bool targeted_match); int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); +int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); int dsa_port_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); +int dsa_port_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); int dsa_port_pre_bridge_flags(const struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); -int dsa_port_bridge_flags(const struct dsa_port *dp, +int dsa_port_bridge_flags(struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct netlink_ext_ack *extack); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); @@ -230,16 +261,18 @@ int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, - struct net_device *dev) + const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, - struct net_device *bridge_dev) + const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. @@ -249,7 +282,7 @@ static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, - struct net_device *dev) + const struct net_device *dev) { struct dsa_port *dp; @@ -260,6 +293,19 @@ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, return false; } +/* Returns true if any port of this tree offloads the given bridge */ +static inline bool dsa_tree_offloads_bridge(struct dsa_switch_tree *dst, + const struct net_device *bridge_dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_offloads_bridge(dp, bridge_dev)) + return true; + + return false; +} + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; extern struct notifier_block dsa_slave_switchdev_notifier; @@ -274,6 +320,8 @@ int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); void dsa_slave_setup_tagger(struct net_device *slave); int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); +int dsa_slave_manage_vlan_filtering(struct net_device *dev, + bool vlan_filtering); static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { @@ -349,6 +397,141 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) return skb; } +/* For switches without hardware support for DSA tagging to be able + * to support termination through the bridge. + */ +static inline struct net_device * +dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct bridge_vlan_info vinfo; + struct net_device *slave; + struct dsa_port *dp; + int err; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->type != DSA_PORT_TYPE_USER) + continue; + + if (!dp->bridge_dev) + continue; + + if (dp->stp_state != BR_STATE_LEARNING && + dp->stp_state != BR_STATE_FORWARDING) + continue; + + /* Since the bridge might learn this packet, keep the CPU port + * affinity with the port that will be used for the reply on + * xmit. + */ + if (dp->cpu_dp != cpu_dp) + continue; + + slave = dp->slave; + + err = br_vlan_get_info_rcu(slave, vid, &vinfo); + if (err) + continue; + + return slave; + } + + return NULL; +} + +/* If the ingress port offloads the bridge, we mark the frame as autonomously + * forwarded by hardware, so the software bridge doesn't forward in twice, back + * to us, because we already did. However, if we're in fallback mode and we do + * software bridging, we are not offloading it, therefore the dp->bridge_dev + * pointer is not populated, and flooding needs to be done by software (we are + * effectively operating in standalone ports mode). + */ +static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + + skb->offload_fwd_mark = !!(dp->bridge_dev); +} + +/* Helper for removing DSA header tags from packets in the RX path. + * Must not be called before skb_pull(len). + * skb->data + * | + * v + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+---------------+-------+ + * | Destination MAC | Source MAC | DSA header | EType | + * +-----------------------+-----------------------+---------------+-------+ + * | | + * <----- len -----> <----- len -----> + * | + * >>>>>>> v + * >>>>>>> | | | | | | | | | | | | | | | + * >>>>>>> +-----------------------+-----------------------+-------+ + * >>>>>>> | Destination MAC | Source MAC | EType | + * +-----------------------+-----------------------+-------+ + * ^ + * | + * skb->data + */ +static inline void dsa_strip_etype_header(struct sk_buff *skb, int len) +{ + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN); +} + +/* Helper for creating space for DSA header tags in TX path packets. + * Must not be called before skb_push(len). + * + * Before: + * + * <<<<<<< | | | | | | | | | | | | | | | + * ^ <<<<<<< +-----------------------+-----------------------+-------+ + * | <<<<<<< | Destination MAC | Source MAC | EType | + * | +-----------------------+-----------------------+-------+ + * <----- len -----> + * | + * | + * skb->data + * + * After: + * + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+---------------+-------+ + * | Destination MAC | Source MAC | DSA header | EType | + * +-----------------------+-----------------------+---------------+-------+ + * ^ | | + * | <----- len -----> + * skb->data + */ +static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len) +{ + memmove(skb->data, skb->data + len, 2 * ETH_ALEN); +} + +/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from + * skb_mac_header(skb), which leaves skb->data pointing at the first byte after + * what the DSA master perceives as the EtherType (the beginning of the L3 + * protocol). Since DSA EtherType header taggers treat the EtherType as part of + * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header + * is located 2 bytes behind skb->data. Note that EtherType in this context + * means the first 2 bytes of the DSA header, not the encapsulated EtherType + * that will become visible after the DSA header is stripped. + */ +static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb) +{ + return skb->data - 2; +} + +/* On TX, skb->data points to skb_mac_header(skb), which means that EtherType + * header taggers start exactly where the EtherType is (the EtherType is + * treated as part of the DSA header). + */ +static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) +{ + return skb->data + 2 * ETH_ALEN; +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); @@ -362,6 +545,18 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, struct net_device *master, const struct dsa_device_ops *tag_ops, const struct dsa_device_ops *old_tag_ops); +int dsa_bridge_num_get(const struct net_device *bridge_dev, int max); +void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num); + +/* tag_8021q.c */ +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info); +int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info); +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); extern struct list_head dsa_tree_list; diff --git a/net/dsa/master.c b/net/dsa/master.c index 63adbc21a735..e8e19857621b 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -210,14 +210,14 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - if (dev->netdev_ops->ndo_do_ioctl) - err = dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); + if (dev->netdev_ops->ndo_eth_ioctl) + err = dev->netdev_ops->ndo_eth_ioctl(dev, ifr, cmd); return err; } static const struct dsa_netdevice_ops dsa_netdev_ops = { - .ndo_do_ioctl = dsa_master_ioctl, + .ndo_eth_ioctl = dsa_master_ioctl, }; static int dsa_master_ethtool_setup(struct net_device *dev) @@ -346,10 +346,12 @@ static struct lock_class_key dsa_master_addr_list_lock_key; int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { - int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops; struct dsa_switch *ds = cpu_dp->ds; struct device_link *consumer_link; - int ret; + int mtu, ret; + + mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops); /* The DSA master must use SET_NETDEV_DEV for this to work. */ consumer_link = device_link_add(ds->dev, dev->dev.parent, diff --git a/net/dsa/port.c b/net/dsa/port.c index 6379d66a6bb3..616330a16d31 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -30,7 +30,52 @@ static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) return dsa_tree_notify(dp->ds->dst, e, v); } -int dsa_port_set_state(struct dsa_port *dp, u8 state) +static void dsa_port_notify_bridge_fdb_flush(const struct dsa_port *dp) +{ + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + struct switchdev_notifier_fdb_info info = { + /* flush all VLANs */ + .vid = 0, + }; + + /* When the port becomes standalone it has already left the bridge. + * Don't notify the bridge in that case. + */ + if (!brport_dev) + return; + + call_switchdev_notifiers(SWITCHDEV_FDB_FLUSH_TO_BRIDGE, + brport_dev, &info.info, NULL); +} + +static void dsa_port_fast_age(const struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_fast_age) + return; + + ds->ops->port_fast_age(ds, dp->index); + + dsa_port_notify_bridge_fdb_flush(dp); +} + +static bool dsa_port_can_configure_learning(struct dsa_port *dp) +{ + struct switchdev_brport_flags flags = { + .mask = BR_LEARNING, + }; + struct dsa_switch *ds = dp->ds; + int err; + + if (!ds->ops->port_bridge_flags || !ds->ops->port_pre_bridge_flags) + return false; + + err = ds->ops->port_pre_bridge_flags(ds, dp->index, flags, NULL); + return !err; +} + +int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age) { struct dsa_switch *ds = dp->ds; int port = dp->index; @@ -40,10 +85,14 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state) ds->ops->port_stp_state_set(ds, port, state); - if (ds->ops->port_fast_age) { + if (!dsa_port_can_configure_learning(dp) || + (do_fast_age && dp->learning)) { /* Fast age FDB entries or flush appropriate forwarding database * for the given port, if we are moving it from Learning or * Forwarding state, to Disabled or Blocking or Listening state. + * Ports that were standalone before the STP state change don't + * need to fast age the FDB, since address learning is off in + * standalone mode. */ if ((dp->stp_state == BR_STATE_LEARNING || @@ -51,7 +100,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state) (state == BR_STATE_DISABLED || state == BR_STATE_BLOCKING || state == BR_STATE_LISTENING)) - ds->ops->port_fast_age(ds, port); + dsa_port_fast_age(dp); } dp->stp_state = state; @@ -59,11 +108,12 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state) return 0; } -static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +static void dsa_port_set_state_now(struct dsa_port *dp, u8 state, + bool do_fast_age) { int err; - err = dsa_port_set_state(dp, state); + err = dsa_port_set_state(dp, state, do_fast_age); if (err) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } @@ -81,7 +131,7 @@ int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy) } if (!dp->bridge_dev) - dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + dsa_port_set_state_now(dp, BR_STATE_FORWARDING, false); if (dp->pl) phylink_start(dp->pl); @@ -109,7 +159,7 @@ void dsa_port_disable_rt(struct dsa_port *dp) phylink_stop(dp->pl); if (!dp->bridge_dev) - dsa_port_set_state_now(dp, BR_STATE_DISABLED); + dsa_port_set_state_now(dp, BR_STATE_DISABLED, false); if (ds->ops->port_disable) ds->ops->port_disable(ds, port); @@ -167,8 +217,8 @@ static void dsa_port_clear_brport_flags(struct dsa_port *dp) } } -static int dsa_port_switchdev_sync(struct dsa_port *dp, - struct netlink_ext_ack *extack) +static int dsa_port_switchdev_sync_attrs(struct dsa_port *dp, + struct netlink_ext_ack *extack) { struct net_device *brport_dev = dsa_port_to_bridge_port(dp); struct net_device *br = dp->bridge_dev; @@ -178,7 +228,7 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err) return err; - err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev)); + err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev), false); if (err && err != -EOPNOTSUPP) return err; @@ -186,34 +236,14 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; - err = dsa_port_mrouter(dp->cpu_dp, br_multicast_router(br), extack); - if (err && err != -EOPNOTSUPP) - return err; - err = dsa_port_ageing_time(dp, br_get_ageing_time(br)); if (err && err != -EOPNOTSUPP) return err; - err = br_mdb_replay(br, brport_dev, - &dsa_slave_switchdev_blocking_notifier, - extack); - if (err && err != -EOPNOTSUPP) - return err; - - err = br_fdb_replay(br, brport_dev, &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - err = br_vlan_replay(br, brport_dev, - &dsa_slave_switchdev_blocking_notifier, - extack); - if (err && err != -EOPNOTSUPP) - return err; - return 0; } -static void dsa_port_switchdev_unsync(struct dsa_port *dp) +static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp) { /* Configure the port for standalone mode (no address learning, * flood everything). @@ -231,21 +261,63 @@ static void dsa_port_switchdev_unsync(struct dsa_port *dp) /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + dsa_port_set_state_now(dp, BR_STATE_FORWARDING, true); /* VLAN filtering is handled by dsa_switch_bridge_leave */ - /* Some drivers treat the notification for having a local multicast - * router by allowing multicast to be flooded to the CPU, so we should - * allow this in standalone mode too. - */ - dsa_port_mrouter(dp->cpu_dp, true, NULL); - /* Ageing time may be global to the switch chip, so don't change it * here because we have no good reason (or value) to change it to. */ } +static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + int bridge_num = dp->bridge_num; + struct dsa_switch *ds = dp->ds; + + /* No bridge TX forwarding offload => do nothing */ + if (!ds->ops->port_bridge_tx_fwd_unoffload || dp->bridge_num == -1) + return; + + dp->bridge_num = -1; + + dsa_bridge_num_put(bridge_dev, bridge_num); + + /* Notify the chips only once the offload has been deactivated, so + * that they can update their configuration accordingly. + */ + ds->ops->port_bridge_tx_fwd_unoffload(ds, dp->index, bridge_dev, + bridge_num); +} + +static bool dsa_port_bridge_tx_fwd_offload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + struct dsa_switch *ds = dp->ds; + int bridge_num, err; + + if (!ds->ops->port_bridge_tx_fwd_offload) + return false; + + bridge_num = dsa_bridge_num_get(bridge_dev, + ds->num_fwd_offloading_bridges); + if (bridge_num < 0) + return false; + + dp->bridge_num = bridge_num; + + /* Notify the driver */ + err = ds->ops->port_bridge_tx_fwd_offload(ds, dp->index, bridge_dev, + bridge_num); + if (err) { + dsa_port_bridge_tx_fwd_unoffload(dp, bridge_dev); + return false; + } + + return true; +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack) { @@ -255,6 +327,9 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, .port = dp->index, .br = br, }; + struct net_device *dev = dp->slave; + struct net_device *brport_dev; + bool tx_fwd_offload; int err; /* Here the interface is already bridged. Reflect the current @@ -262,16 +337,31 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, */ dp->bridge_dev = br; + brport_dev = dsa_port_to_bridge_port(dp); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); if (err) goto out_rollback; - err = dsa_port_switchdev_sync(dp, extack); + tx_fwd_offload = dsa_port_bridge_tx_fwd_offload(dp, br); + + err = switchdev_bridge_port_offload(brport_dev, dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier, + tx_fwd_offload, extack); if (err) goto out_rollback_unbridge; + err = dsa_port_switchdev_sync_attrs(dp, extack); + if (err) + goto out_rollback_unoffload; + return 0; +out_rollback_unoffload: + switchdev_bridge_port_unoffload(brport_dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier); out_rollback_unbridge: dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); out_rollback: @@ -279,6 +369,19 @@ out_rollback: return err; } +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br) +{ + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + + /* Don't try to unoffload something that is not offloaded */ + if (!brport_dev) + return; + + switchdev_bridge_port_unoffload(brport_dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier); +} + void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { @@ -294,11 +397,15 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; + dsa_port_bridge_tx_fwd_unoffload(dp, br); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); + dev_err(dp->ds->dev, + "port %d failed to notify DSA_NOTIFIER_BRIDGE_LEAVE: %pe\n", + dp->index, ERR_PTR(err)); - dsa_port_switchdev_unsync(dp); + dsa_port_switchdev_unsync_attrs(dp); } int dsa_port_lag_change(struct dsa_port *dp, @@ -366,6 +473,12 @@ err_lag_join: return err; } +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag) +{ + if (dp->bridge_dev) + dsa_port_pre_bridge_leave(dp, dp->bridge_dev); +} + void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) { struct dsa_notifier_lag_info info = { @@ -389,8 +502,9 @@ void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_LAG_LEAVE: %d\n", - err); + dev_err(dp->ds->dev, + "port %d failed to notify DSA_NOTIFIER_LAG_LEAVE: %pe\n", + dp->index, ERR_PTR(err)); dsa_lag_unmap(dp->ds->dst, lag); } @@ -466,6 +580,7 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack) { + bool old_vlan_filtering = dsa_port_is_vlan_filtering(dp); struct dsa_switch *ds = dp->ds; bool apply; int err; @@ -491,12 +606,49 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, if (err) return err; - if (ds->vlan_filtering_is_global) + if (ds->vlan_filtering_is_global) { + int port; + ds->vlan_filtering = vlan_filtering; - else + + for (port = 0; port < ds->num_ports; port++) { + struct net_device *slave; + + if (!dsa_is_user_port(ds, port)) + continue; + + /* We might be called in the unbind path, so not + * all slave devices might still be registered. + */ + slave = dsa_to_port(ds, port)->slave; + if (!slave) + continue; + + err = dsa_slave_manage_vlan_filtering(slave, + vlan_filtering); + if (err) + goto restore; + } + } else { dp->vlan_filtering = vlan_filtering; + err = dsa_slave_manage_vlan_filtering(dp->slave, + vlan_filtering); + if (err) + goto restore; + } + return 0; + +restore: + ds->ops->port_vlan_filtering(ds, dp->index, old_vlan_filtering, NULL); + + if (ds->vlan_filtering_is_global) + ds->vlan_filtering = old_vlan_filtering; + else + dp->vlan_filtering = old_vlan_filtering; + + return err; } /* This enforces legacy behavior for switch drivers which assume they can't @@ -543,35 +695,43 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp, return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack); } -int dsa_port_bridge_flags(const struct dsa_port *dp, +int dsa_port_bridge_flags(struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; + int err; if (!ds->ops->port_bridge_flags) return -EOPNOTSUPP; - return ds->ops->port_bridge_flags(ds, dp->index, flags, extack); -} + err = ds->ops->port_bridge_flags(ds, dp->index, flags, extack); + if (err) + return err; -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct netlink_ext_ack *extack) -{ - struct dsa_switch *ds = dp->ds; + if (flags.mask & BR_LEARNING) { + bool learning = flags.val & BR_LEARNING; - if (!ds->ops->port_set_mrouter) - return -EOPNOTSUPP; + if (learning == dp->learning) + return 0; + + if ((dp->learning && !learning) && + (dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING)) + dsa_port_fast_age(dp); + + dp->learning = learning; + } - return ds->ops->port_set_mrouter(ds, dp->index, mrouter, extack); + return 0; } int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, - bool propagate_upstream) + bool targeted_match) { struct dsa_notifier_mtu_info info = { .sw_index = dp->ds->index, - .propagate_upstream = propagate_upstream, + .targeted_match = targeted_match, .port = dp->index, .mtu = new_mtu, }; @@ -606,6 +766,44 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } +int dsa_port_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid) +{ + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .addr = addr, + .vid = vid, + }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_uc_add(cpu_dp->master, addr); + if (err) + return err; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info); +} + +int dsa_port_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid) +{ + struct dsa_notifier_fdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .addr = addr, + .vid = vid, + }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_uc_del(cpu_dp->master, addr); + if (err) + return err; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info); +} + int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) { struct dsa_switch *ds = dp->ds; @@ -641,6 +839,42 @@ int dsa_port_mdb_del(const struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info); } +int dsa_port_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mdb = mdb, + }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_mc_add(cpu_dp->master, mdb->addr); + if (err) + return err; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info); +} + +int dsa_port_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_notifier_mdb_info info = { + .sw_index = dp->ds->index, + .port = dp->index, + .mdb = mdb, + }; + struct dsa_port *cpu_dp = dp->cpu_dp; + int err; + + err = dev_mc_del(cpu_dp->master, mdb->addr); + if (err) + return err; + + return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info); +} + int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack) @@ -718,7 +952,6 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { - cpu_dp->filter = tag_ops->filter; cpu_dp->rcv = tag_ops->rcv; cpu_dp->tag_ops = tag_ops; } @@ -1089,5 +1322,42 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) err = dsa_port_notify(dp, DSA_NOTIFIER_HSR_LEAVE, &info); if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_HSR_LEAVE\n"); + dev_err(dp->ds->dev, + "port %d failed to notify DSA_NOTIFIER_HSR_LEAVE: %pe\n", + dp->index, ERR_PTR(err)); +} + +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast) +{ + struct dsa_notifier_tag_8021q_vlan_info info = { + .tree_index = dp->ds->dst->index, + .sw_index = dp->ds->index, + .port = dp->index, + .vid = vid, + }; + + if (broadcast) + return dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info); + + return dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info); +} + +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast) +{ + struct dsa_notifier_tag_8021q_vlan_info info = { + .tree_index = dp->ds->dst->index, + .sw_index = dp->ds->index, + .port = dp->index, + .vid = vid, + }; + int err; + + if (broadcast) + err = dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info); + else + err = dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info); + if (err) + dev_err(dp->ds->dev, + "port %d failed to notify tag_8021q VLAN %d deletion: %pe\n", + dp->index, vid, ERR_PTR(err)); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d4756b920108..662ff531d4e2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -271,19 +271,22 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } -static int dsa_slave_port_attr_set(struct net_device *dev, +static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int ret; + if (ctx && ctx != dp) + return 0; + switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; - ret = dsa_port_set_state(dp, attr->u.stp_state); + ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) @@ -311,12 +314,6 @@ static int dsa_slave_port_attr_set(struct net_device *dev, ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; - case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: - if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) - return -EOPNOTSUPP; - - ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, extack); - break; default: ret = -EOPNOTSUPP; break; @@ -394,13 +391,16 @@ static int dsa_slave_vlan_add(struct net_device *dev, return vlan_vid_add(master, htons(ETH_P_8021Q), vlan.vid); } -static int dsa_slave_port_obj_add(struct net_device *dev, +static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; + if (ctx && ctx != dp) + return 0; + switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) @@ -412,10 +412,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; - /* DSA can directly translate this to a normal MDB add, - * but on the CPU port. - */ - err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) @@ -469,12 +466,15 @@ static int dsa_slave_vlan_del(struct net_device *dev, return 0; } -static int dsa_slave_port_obj_del(struct net_device *dev, +static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_slave_to_port(dev); int err; + if (ctx && ctx != dp) + return 0; + switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) @@ -486,10 +486,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev, if (!dsa_port_offloads_bridge(dp, obj->orig_dev)) return -EOPNOTSUPP; - /* DSA can directly translate this to a normal MDB add, - * but on the CPU port. - */ - err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj)); + err = dsa_port_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) @@ -1412,6 +1409,76 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, return 0; } +static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg) +{ + __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); + + return dsa_slave_vlan_rx_add_vid(arg, proto, vid); +} + +static int dsa_slave_clear_vlan(struct net_device *vdev, int vid, void *arg) +{ + __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); + + return dsa_slave_vlan_rx_kill_vid(arg, proto, vid); +} + +/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN + * filtering is enabled. The baseline is that only ports that offload a + * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, + * but there are exceptions for quirky hardware. + * + * If ds->vlan_filtering_is_global = true, then standalone ports which share + * the same switch with other ports that offload a VLAN-aware bridge are also + * inevitably VLAN-aware. + * + * To summarize, a DSA switch port offloads: + * + * - If standalone (this includes software bridge, software LAG): + * - if ds->needs_standalone_vlan_filtering = true, OR if + * (ds->vlan_filtering_is_global = true AND there are bridges spanning + * this switch chip which have vlan_filtering=1) + * - the 8021q upper VLANs + * - else (standalone VLAN filtering is not needed, VLAN filtering is not + * global, or it is, but no port is under a VLAN-aware bridge): + * - no VLAN (any 8021q upper is a software VLAN) + * + * - If under a vlan_filtering=0 bridge which it offload: + * - if ds->configure_vlan_while_not_filtering = true (default): + * - the bridge VLANs. These VLANs are committed to hardware but inactive. + * - else (deprecated): + * - no VLAN. The bridge VLANs are not restored when VLAN awareness is + * enabled, so this behavior is broken and discouraged. + * + * - If under a vlan_filtering=1 bridge which it offload: + * - the bridge VLANs + * - the 8021q upper VLANs + */ +int dsa_slave_manage_vlan_filtering(struct net_device *slave, + bool vlan_filtering) +{ + int err; + + if (vlan_filtering) { + slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + + err = vlan_for_each(slave, dsa_slave_restore_vlan, slave); + if (err) { + vlan_for_each(slave, dsa_slave_clear_vlan, slave); + slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; + return err; + } + } else { + err = vlan_for_each(slave, dsa_slave_clear_vlan, slave); + if (err) + return err; + + slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; + } + + return 0; +} + struct dsa_hw_port { struct list_head list; struct net_device *dev; @@ -1528,6 +1595,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp_iter; struct dsa_port *cpu_dp; int port = p->dp->index; int largest_mtu = 0; @@ -1535,31 +1603,31 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) int old_master_mtu; int mtu_limit; int cpu_mtu; - int err, i; + int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; - for (i = 0; i < ds->num_ports; i++) { + list_for_each_entry(dp_iter, &ds->dst->ports, list) { int slave_mtu; - if (!dsa_is_user_port(ds, i)) + if (!dsa_port_is_user(dp_iter)) continue; /* During probe, this function will be called for each slave * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ - if (!dsa_to_port(ds, i)->slave) + if (!dp_iter->slave) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ - if (i == port) + if (dp_iter == dp) slave_mtu = new_mtu; else - slave_mtu = dsa_to_port(ds, i)->slave->mtu; + slave_mtu = dp_iter->slave->mtu; if (largest_mtu < slave_mtu) largest_mtu = slave_mtu; @@ -1569,7 +1637,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) mtu_limit = min_t(int, master->max_mtu, dev->max_mtu); old_master_mtu = master->mtu; - new_master_mtu = largest_mtu + cpu_dp->tag_ops->overhead; + new_master_mtu = largest_mtu + dsa_tag_protocol_overhead(cpu_dp->tag_ops); if (new_master_mtu > mtu_limit) return -ERANGE; @@ -1585,14 +1653,15 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) goto out_master_failed; /* We only need to propagate the MTU of the CPU port to - * upstream switches. + * upstream switches, so create a non-targeted notifier which + * updates all switches. */ - err = dsa_port_mtu_change(cpu_dp, cpu_mtu, true); + err = dsa_port_mtu_change(cpu_dp, cpu_mtu, false); if (err) goto out_cpu_failed; } - err = dsa_port_mtu_change(dp, new_mtu, false); + err = dsa_port_mtu_change(dp, new_mtu, true); if (err) goto out_port_failed; @@ -1605,8 +1674,8 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) out_port_failed: if (new_master_mtu != old_master_mtu) dsa_port_mtu_change(cpu_dp, old_master_mtu - - cpu_dp->tag_ops->overhead, - true); + dsa_tag_protocol_overhead(cpu_dp->tag_ops), + false); out_cpu_failed: if (new_master_mtu != old_master_mtu) dev_set_mtu(master, old_master_mtu); @@ -1640,27 +1709,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .self_test = dsa_slave_net_selftest, }; -/* legacy way, bypassing the bridge *****************************************/ -static int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, - u16 flags, - struct netlink_ext_ack *extack) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_add(dp, addr, vid); -} - -static int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_fdb_del(dp, addr, vid); -} - static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1702,10 +1750,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_fdb_add = dsa_legacy_fdb_add, - .ndo_fdb_del = dsa_legacy_fdb_del, .ndo_fdb_dump = dsa_slave_fdb_dump, - .ndo_do_ioctl = dsa_slave_ioctl, + .ndo_eth_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_slave_netpoll_setup, @@ -1749,7 +1795,8 @@ static void dsa_slave_phylink_fixed_state(struct phylink_config *config, } /* slave device setup *******************************************************/ -static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) +static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr, + u32 flags) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_switch *ds = dp->ds; @@ -1760,6 +1807,8 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) return -ENODEV; } + slave_dev->phydev->dev_flags |= flags; + return phylink_connect_phy(dp->pl, slave_dev->phydev); } @@ -1804,7 +1853,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ - ret = dsa_slave_phy_connect(slave_dev, dp->index); + ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", @@ -1823,11 +1872,10 @@ void dsa_slave_setup_tagger(struct net_device *slave) struct dsa_slave_priv *p = netdev_priv(slave); const struct dsa_port *cpu_dp = dp->cpu_dp; struct net_device *master = cpu_dp->master; + const struct dsa_switch *ds = dp->ds; - if (cpu_dp->tag_ops->tail_tag) - slave->needed_tailroom = cpu_dp->tag_ops->overhead; - else - slave->needed_headroom = cpu_dp->tag_ops->overhead; + slave->needed_headroom = cpu_dp->tag_ops->needed_headroom; + slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the master) * by also inheriting the master's needed headroom and tailroom. * The 8021q driver also does this. @@ -1836,6 +1884,14 @@ void dsa_slave_setup_tagger(struct net_device *slave) slave->needed_tailroom += master->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; + + slave->features = master->vlan_features | NETIF_F_HW_TC; + slave->hw_features |= NETIF_F_HW_TC; + slave->features |= NETIF_F_LLTX; + if (slave->needed_tailroom) + slave->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); + if (ds->needs_standalone_vlan_filtering) + slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } static struct lock_class_key dsa_slave_netdev_xmit_lock_key; @@ -1898,11 +1954,6 @@ int dsa_slave_create(struct dsa_port *port) if (slave_dev == NULL) return -ENOMEM; - slave_dev->features = master->vlan_features | NETIF_F_HW_TC; - if (ds->ops->port_vlan_add && ds->ops->port_vlan_del) - slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; - slave_dev->hw_features |= NETIF_F_HW_TC; - slave_dev->features |= NETIF_F_LLTX; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; if (!is_zero_ether_addr(port->mac)) ether_addr_copy(slave_dev->dev_addr, port->mac); @@ -2028,6 +2079,11 @@ static int dsa_slave_changeupper(struct net_device *dev, err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); + if (err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(extack, + "Offloading not supported"); + err = 0; + } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); @@ -2065,6 +2121,22 @@ static int dsa_slave_changeupper(struct net_device *dev, return err; } +static int dsa_slave_prechangeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + if (netif_is_bridge_master(info->upper_dev) && !info->linking) + dsa_port_pre_bridge_leave(dp, info->upper_dev); + else if (netif_is_lag_master(info->upper_dev) && !info->linking) + dsa_port_pre_lag_leave(dp, info->upper_dev); + /* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be + * meaningfully enslaved to a bridge yet + */ + + return NOTIFY_DONE; +} + static int dsa_slave_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) @@ -2091,6 +2163,35 @@ dsa_slave_lag_changeupper(struct net_device *dev, return err; } +/* Same as dsa_slave_lag_changeupper() except that it calls + * dsa_slave_prechangeupper() + */ +static int +dsa_slave_lag_prechangeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *lower; + struct list_head *iter; + int err = NOTIFY_DONE; + struct dsa_port *dp; + + netdev_for_each_lower_dev(dev, lower, iter) { + if (!dsa_slave_dev_check(lower)) + continue; + + dp = dsa_slave_to_port(lower); + if (!dp->lag_dev) + /* Software LAG */ + continue; + + err = dsa_slave_prechangeupper(lower, info); + if (notifier_to_errno(err)) + break; + } + + return err; +} + static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) @@ -2154,6 +2255,32 @@ dsa_slave_check_8021q_upper(struct net_device *dev, return NOTIFY_DONE; } +static int +dsa_slave_prechangeupper_sanity_check(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct dsa_switch *ds; + struct dsa_port *dp; + int err; + + if (!dsa_slave_dev_check(dev)) + return dsa_prevent_bridging_8021q_upper(dev, info); + + dp = dsa_slave_to_port(dev); + ds = dp->ds; + + if (ds->ops->port_prechangeupper) { + err = ds->ops->port_prechangeupper(ds, dp->index, info); + if (err) + return notifier_from_errno(err); + } + + if (is_vlan_dev(info->upper_dev)) + return dsa_slave_check_8021q_upper(dev, info); + + return NOTIFY_DONE; +} + static int dsa_slave_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -2162,24 +2289,18 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; - struct dsa_switch *ds; - struct dsa_port *dp; int err; - if (!dsa_slave_dev_check(dev)) - return dsa_prevent_bridging_8021q_upper(dev, ptr); + err = dsa_slave_prechangeupper_sanity_check(dev, info); + if (err != NOTIFY_DONE) + return err; - dp = dsa_slave_to_port(dev); - ds = dp->ds; + if (dsa_slave_dev_check(dev)) + return dsa_slave_prechangeupper(dev, ptr); - if (ds->ops->port_prechangeupper) { - err = ds->ops->port_prechangeupper(ds, dp->index, info); - if (err) - return notifier_from_errno(err); - } + if (netif_is_lag_master(dev)) + return dsa_slave_lag_prechangeupper(dev, ptr); - if (is_vlan_dev(info->upper_dev)) - return dsa_slave_check_8021q_upper(dev, ptr); break; } case NETDEV_CHANGEUPPER: @@ -2235,8 +2356,8 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { + struct switchdev_notifier_fdb_info info = {}; struct dsa_switch *ds = switchdev_work->ds; - struct switchdev_notifier_fdb_info info; struct dsa_port *dp; if (!dsa_is_user_port(ds, switchdev_work->port)) @@ -2263,8 +2384,12 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: - err = dsa_port_fdb_add(dp, switchdev_work->addr, - switchdev_work->vid); + if (switchdev_work->host_addr) + err = dsa_port_host_fdb_add(dp, switchdev_work->addr, + switchdev_work->vid); + else + err = dsa_port_fdb_add(dp, switchdev_work->addr, + switchdev_work->vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", @@ -2276,8 +2401,12 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) break; case SWITCHDEV_FDB_DEL_TO_DEVICE: - err = dsa_port_fdb_del(dp, switchdev_work->addr, - switchdev_work->vid); + if (switchdev_work->host_addr) + err = dsa_port_host_fdb_del(dp, switchdev_work->addr, + switchdev_work->vid); + else + err = dsa_port_fdb_del(dp, switchdev_work->addr, + switchdev_work->vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", @@ -2289,31 +2418,102 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) } rtnl_unlock(); + dev_put(switchdev_work->dev); kfree(switchdev_work); - if (dsa_is_user_port(ds, dp->index)) - dev_put(dp->slave); } -static int dsa_lower_dev_walk(struct net_device *lower_dev, - struct netdev_nested_priv *priv) +static bool dsa_foreign_dev_check(const struct net_device *dev, + const struct net_device *foreign_dev) { - if (dsa_slave_dev_check(lower_dev)) { - priv->data = (void *)netdev_priv(lower_dev); - return 1; - } + const struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch_tree *dst = dp->ds->dst; - return 0; + if (netif_is_bridge_master(foreign_dev)) + return !dsa_tree_offloads_bridge(dst, foreign_dev); + + if (netif_is_bridge_port(foreign_dev)) + return !dsa_tree_offloads_bridge_port(dst, foreign_dev); + + /* Everything else is foreign */ + return true; } -static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev) +static int dsa_slave_fdb_event(struct net_device *dev, + const struct net_device *orig_dev, + const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info, + unsigned long event) { - struct netdev_nested_priv priv = { - .data = NULL, - }; + struct dsa_switchdev_event_work *switchdev_work; + struct dsa_port *dp = dsa_slave_to_port(dev); + bool host_addr = fdb_info->is_local; + struct dsa_switch *ds = dp->ds; + + if (ctx && ctx != dp) + return 0; - netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv); + if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) + return -EOPNOTSUPP; - return (struct dsa_slave_priv *)priv.data; + if (dsa_slave_dev_check(orig_dev) && + switchdev_fdb_is_dynamically_learned(fdb_info)) + return 0; + + /* FDB entries learned by the software bridge should be installed as + * host addresses only if the driver requests assisted learning. + */ + if (switchdev_fdb_is_dynamically_learned(fdb_info) && + !ds->assisted_learning_on_cpu_port) + return 0; + + /* Also treat FDB entries on foreign interfaces bridged with us as host + * addresses. + */ + if (dsa_foreign_dev_check(dev, orig_dev)) + host_addr = true; + + switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); + if (!switchdev_work) + return -ENOMEM; + + netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", + event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", + orig_dev->name, fdb_info->addr, fdb_info->vid, + host_addr ? " as host address" : ""); + + INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work); + switchdev_work->ds = ds; + switchdev_work->port = dp->index; + switchdev_work->event = event; + switchdev_work->dev = dev; + + ether_addr_copy(switchdev_work->addr, fdb_info->addr); + switchdev_work->vid = fdb_info->vid; + switchdev_work->host_addr = host_addr; + + /* Hold a reference for dsa_fdb_offload_notify */ + dev_hold(dev); + dsa_schedule_work(&switchdev_work->work); + + return 0; +} + +static int +dsa_slave_fdb_add_to_device(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info) +{ + return dsa_slave_fdb_event(dev, orig_dev, ctx, fdb_info, + SWITCHDEV_FDB_ADD_TO_DEVICE); +} + +static int +dsa_slave_fdb_del_to_device(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info) +{ + return dsa_slave_fdb_event(dev, orig_dev, ctx, fdb_info, + SWITCHDEV_FDB_DEL_TO_DEVICE); } /* Called under rcu_read_lock() */ @@ -2321,9 +2521,6 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); - const struct switchdev_notifier_fdb_info *fdb_info; - struct dsa_switchdev_event_work *switchdev_work; - struct dsa_port *dp; int err; switch (event) { @@ -2333,69 +2530,19 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, dsa_slave_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: + err = switchdev_handle_fdb_add_to_device(dev, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_fdb_add_to_device, + NULL); + return notifier_from_errno(err); case SWITCHDEV_FDB_DEL_TO_DEVICE: - fdb_info = ptr; - - if (dsa_slave_dev_check(dev)) { - if (!fdb_info->added_by_user || fdb_info->is_local) - return NOTIFY_OK; - - dp = dsa_slave_to_port(dev); - } else { - /* Snoop addresses learnt on foreign interfaces - * bridged with us, for switches that don't - * automatically learn SA from CPU-injected traffic - */ - struct net_device *br_dev; - struct dsa_slave_priv *p; - - br_dev = netdev_master_upper_dev_get_rcu(dev); - if (!br_dev) - return NOTIFY_DONE; - - if (!netif_is_bridge_master(br_dev)) - return NOTIFY_DONE; - - p = dsa_slave_dev_lower_find(br_dev); - if (!p) - return NOTIFY_DONE; - - dp = p->dp->cpu_dp; - - if (!dp->ds->assisted_learning_on_cpu_port) - return NOTIFY_DONE; - - /* When the bridge learns an address on an offloaded - * LAG we don't want to send traffic to the CPU, the - * other ports bridged with the LAG should be able to - * autonomously forward towards it. - */ - if (dsa_tree_offloads_bridge_port(dp->ds->dst, dev)) - return NOTIFY_DONE; - } - - if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del) - return NOTIFY_DONE; - - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); - if (!switchdev_work) - return NOTIFY_BAD; - - INIT_WORK(&switchdev_work->work, - dsa_slave_switchdev_event_work); - switchdev_work->ds = dp->ds; - switchdev_work->port = dp->index; - switchdev_work->event = event; - - ether_addr_copy(switchdev_work->addr, - fdb_info->addr); - switchdev_work->vid = fdb_info->vid; - - /* Hold a reference on the slave for dsa_fdb_offload_notify */ - if (dsa_is_user_port(dp->ds, dp->index)) - dev_hold(dev); - dsa_schedule_work(&switchdev_work->work); - break; + err = switchdev_handle_fdb_del_to_device(dev, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_fdb_del_to_device, + NULL); + return notifier_from_errno(err); default: return NOTIFY_DONE; } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 9bf8e20ecdf3..1c797ec8e2c2 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -52,10 +52,13 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, struct dsa_notifier_mtu_info *info) { - if (ds->index == info->sw_index) - return (port == info->port) || dsa_is_dsa_port(ds, port); + if (ds->index == info->sw_index && port == info->port) + return true; - if (!info->propagate_upstream) + /* Do not propagate to other switches in the tree if the notifier was + * targeted for a single switch. + */ + if (info->targeted_match) return false; if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) @@ -87,38 +90,57 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { struct dsa_switch_tree *dst = ds->dst; + int err; - if (dst->index == info->tree_index && ds->index == info->sw_index && - ds->ops->port_bridge_join) - return ds->ops->port_bridge_join(ds, info->port, info->br); + if (dst->index == info->tree_index && ds->index == info->sw_index) { + if (!ds->ops->port_bridge_join) + return -EOPNOTSUPP; + + err = ds->ops->port_bridge_join(ds, info->port, info->br); + if (err) + return err; + } if ((dst->index != info->tree_index || ds->index != info->sw_index) && - ds->ops->crosschip_bridge_join) - return ds->ops->crosschip_bridge_join(ds, info->tree_index, - info->sw_index, - info->port, info->br); + ds->ops->crosschip_bridge_join) { + err = ds->ops->crosschip_bridge_join(ds, info->tree_index, + info->sw_index, + info->port, info->br); + if (err) + return err; + } - return 0; + return dsa_tag_8021q_bridge_join(ds, info); } static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { - bool unset_vlan_filtering = br_vlan_enabled(info->br); struct dsa_switch_tree *dst = ds->dst; struct netlink_ext_ack extack = {0}; + bool change_vlan_filtering = false; + bool vlan_filtering; int err, port; if (dst->index == info->tree_index && ds->index == info->sw_index && - ds->ops->port_bridge_join) + ds->ops->port_bridge_leave) ds->ops->port_bridge_leave(ds, info->port, info->br); if ((dst->index != info->tree_index || ds->index != info->sw_index) && - ds->ops->crosschip_bridge_join) + ds->ops->crosschip_bridge_leave) ds->ops->crosschip_bridge_leave(ds, info->tree_index, info->sw_index, info->port, info->br); + if (ds->needs_standalone_vlan_filtering && !br_vlan_enabled(info->br)) { + change_vlan_filtering = true; + vlan_filtering = true; + } else if (!ds->needs_standalone_vlan_filtering && + br_vlan_enabled(info->br)) { + change_vlan_filtering = true; + vlan_filtering = false; + } + /* If the bridge was vlan_filtering, the bridge core doesn't trigger an * event for changing vlan_filtering setting upon slave ports leaving * it. That is a good thing, because that lets us handle it and also @@ -127,30 +149,240 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, * vlan_filtering callback is only when the last port leaves the last * VLAN-aware bridge. */ - if (unset_vlan_filtering && ds->vlan_filtering_is_global) { + if (change_vlan_filtering && ds->vlan_filtering_is_global) { for (port = 0; port < ds->num_ports; port++) { struct net_device *bridge_dev; bridge_dev = dsa_to_port(ds, port)->bridge_dev; if (bridge_dev && br_vlan_enabled(bridge_dev)) { - unset_vlan_filtering = false; + change_vlan_filtering = false; break; } } } - if (unset_vlan_filtering) { + + if (change_vlan_filtering) { err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false, &extack); + vlan_filtering, &extack); if (extack._msg) dev_err(ds->dev, "port %d: %s\n", info->port, extack._msg); if (err && err != EOPNOTSUPP) return err; } + + return dsa_tag_8021q_bridge_leave(ds, info); +} + +/* Matches for all upstream-facing ports (the CPU port and all upstream-facing + * DSA links) that sit between the targeted port on which the notifier was + * emitted and its dedicated CPU port. + */ +static bool dsa_switch_host_address_match(struct dsa_switch *ds, int port, + int info_sw_index, int info_port) +{ + struct dsa_port *targeted_dp, *cpu_dp; + struct dsa_switch *targeted_ds; + + targeted_ds = dsa_switch_find(ds->dst->index, info_sw_index); + targeted_dp = dsa_to_port(targeted_ds, info_port); + cpu_dp = targeted_dp->cpu_dp; + + if (dsa_switch_is_upstream_of(ds, targeted_ds)) + return port == dsa_towards_port(ds, cpu_dp->ds->index, + cpu_dp->index); + + return false; +} + +static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list, + const unsigned char *addr, + u16 vid) +{ + struct dsa_mac_addr *a; + + list_for_each_entry(a, addr_list, list) + if (ether_addr_equal(a->addr, addr) && a->vid == vid) + return a; + + return NULL; +} + +static int dsa_switch_do_mdb_add(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_mdb_add(ds, port, mdb); + + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); + if (a) { + refcount_inc(&a->refcount); + return 0; + } + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = ds->ops->port_mdb_add(ds, port, mdb); + if (err) { + kfree(a); + return err; + } + + ether_addr_copy(a->addr, mdb->addr); + a->vid = mdb->vid; + refcount_set(&a->refcount, 1); + list_add_tail(&a->list, &dp->mdbs); + return 0; } +static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_mdb_del(ds, port, mdb); + + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); + if (!a) + return -ENOENT; + + if (!refcount_dec_and_test(&a->refcount)) + return 0; + + err = ds->ops->port_mdb_del(ds, port, mdb); + if (err) { + refcount_inc(&a->refcount); + return err; + } + + list_del(&a->list); + kfree(a); + + return 0; +} + +static int dsa_switch_do_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_fdb_add(ds, port, addr, vid); + + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); + if (a) { + refcount_inc(&a->refcount); + return 0; + } + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + err = ds->ops->port_fdb_add(ds, port, addr, vid); + if (err) { + kfree(a); + return err; + } + + ether_addr_copy(a->addr, addr); + a->vid = vid; + refcount_set(&a->refcount, 1); + list_add_tail(&a->list, &dp->fdbs); + + return 0; +} + +static int dsa_switch_do_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->port_fdb_del(ds, port, addr, vid); + + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); + if (!a) + return -ENOENT; + + if (!refcount_dec_and_test(&a->refcount)) + return 0; + + err = ds->ops->port_fdb_del(ds, port, addr, vid); + if (err) { + refcount_inc(&a->refcount); + return err; + } + + list_del(&a->list); + kfree(a); + + return 0; +} + +static int dsa_switch_host_fdb_add(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_fdb_add) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = dsa_switch_do_fdb_add(ds, port, info->addr, + info->vid); + if (err) + break; + } + } + + return err; +} + +static int dsa_switch_host_fdb_del(struct dsa_switch *ds, + struct dsa_notifier_fdb_info *info) +{ + int err = 0; + int port; + + if (!ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = dsa_switch_do_fdb_del(ds, port, info->addr, + info->vid); + if (err) + break; + } + } + + return err; +} + static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { @@ -159,7 +391,7 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_add(ds, port, info->addr, info->vid); + return dsa_switch_do_fdb_add(ds, port, info->addr, info->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, @@ -170,7 +402,7 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return ds->ops->port_fdb_del(ds, port, info->addr, info->vid); + return dsa_switch_do_fdb_del(ds, port, info->addr, info->vid); } static int dsa_switch_hsr_join(struct dsa_switch *ds, @@ -216,7 +448,7 @@ static int dsa_switch_lag_join(struct dsa_switch *ds, info->port, info->lag, info->info); - return 0; + return -EOPNOTSUPP; } static int dsa_switch_lag_leave(struct dsa_switch *ds, @@ -229,24 +461,34 @@ static int dsa_switch_lag_leave(struct dsa_switch *ds, return ds->ops->crosschip_lag_leave(ds, info->sw_index, info->port, info->lag); - return 0; + return -EOPNOTSUPP; } -static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mdb_info *info) +static int dsa_switch_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) { - if (ds->index == info->sw_index && port == info->port) - return true; + int port = dsa_towards_port(ds, info->sw_index, info->port); - if (dsa_is_dsa_port(ds, port)) - return true; + if (!ds->ops->port_mdb_add) + return -EOPNOTSUPP; - return false; + return dsa_switch_do_mdb_add(ds, port, info->mdb); } -static int dsa_switch_mdb_add(struct dsa_switch *ds, +static int dsa_switch_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + int port = dsa_towards_port(ds, info->sw_index, info->port); + + if (!ds->ops->port_mdb_del) + return -EOPNOTSUPP; + + return dsa_switch_do_mdb_del(ds, port, info->mdb); +} + +static int dsa_switch_host_mdb_add(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) +{ int err = 0; int port; @@ -254,8 +496,9 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, return -EOPNOTSUPP; for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mdb_match(ds, port, info)) { - err = ds->ops->port_mdb_add(ds, port, info->mdb); + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = dsa_switch_do_mdb_add(ds, port, info->mdb); if (err) break; } @@ -264,16 +507,25 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, return err; } -static int dsa_switch_mdb_del(struct dsa_switch *ds, - struct dsa_notifier_mdb_info *info) +static int dsa_switch_host_mdb_del(struct dsa_switch *ds, + struct dsa_notifier_mdb_info *info) { + int err = 0; + int port; + if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - if (ds->index == info->sw_index) - return ds->ops->port_mdb_del(ds, info->port, info->mdb); + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_host_address_match(ds, port, info->sw_index, + info->port)) { + err = dsa_switch_do_mdb_del(ds, port, info->mdb); + if (err) + break; + } + } - return 0; + return err; } static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, @@ -364,36 +616,16 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, return 0; } -static bool dsa_switch_mrp_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mrp_info *info) -{ - if (ds->index == info->sw_index && port == info->port) - return true; - - if (dsa_is_dsa_port(ds, port)) - return true; - - return false; -} - static int dsa_switch_mrp_add(struct dsa_switch *ds, struct dsa_notifier_mrp_info *info) { - int err = 0; - int port; - if (!ds->ops->port_mrp_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mrp_match(ds, port, info)) { - err = ds->ops->port_mrp_add(ds, port, info->mrp); - if (err) - break; - } - } + if (ds->index == info->sw_index) + return ds->ops->port_mrp_add(ds, info->port, info->mrp); - return err; + return 0; } static int dsa_switch_mrp_del(struct dsa_switch *ds, @@ -408,39 +640,18 @@ static int dsa_switch_mrp_del(struct dsa_switch *ds, return 0; } -static bool -dsa_switch_mrp_ring_role_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mrp_ring_role_info *info) -{ - if (ds->index == info->sw_index && port == info->port) - return true; - - if (dsa_is_dsa_port(ds, port)) - return true; - - return false; -} - static int dsa_switch_mrp_add_ring_role(struct dsa_switch *ds, struct dsa_notifier_mrp_ring_role_info *info) { - int err = 0; - int port; - if (!ds->ops->port_mrp_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mrp_ring_role_match(ds, port, info)) { - err = ds->ops->port_mrp_add_ring_role(ds, port, - info->mrp); - if (err) - break; - } - } + if (ds->index == info->sw_index) + return ds->ops->port_mrp_add_ring_role(ds, info->port, + info->mrp); - return err; + return 0; } static int @@ -479,6 +690,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_FDB_DEL: err = dsa_switch_fdb_del(ds, info); break; + case DSA_NOTIFIER_HOST_FDB_ADD: + err = dsa_switch_host_fdb_add(ds, info); + break; + case DSA_NOTIFIER_HOST_FDB_DEL: + err = dsa_switch_host_fdb_del(ds, info); + break; case DSA_NOTIFIER_HSR_JOIN: err = dsa_switch_hsr_join(ds, info); break; @@ -500,6 +717,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_MDB_DEL: err = dsa_switch_mdb_del(ds, info); break; + case DSA_NOTIFIER_HOST_MDB_ADD: + err = dsa_switch_host_mdb_add(ds, info); + break; + case DSA_NOTIFIER_HOST_MDB_DEL: + err = dsa_switch_host_mdb_del(ds, info); + break; case DSA_NOTIFIER_VLAN_ADD: err = dsa_switch_vlan_add(ds, info); break; @@ -524,6 +747,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_MRP_DEL_RING_ROLE: err = dsa_switch_mrp_del_ring_role(ds, info); break; + case DSA_NOTIFIER_TAG_8021Q_VLAN_ADD: + err = dsa_switch_tag_8021q_vlan_add(ds, info); + break; + case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL: + err = dsa_switch_tag_8021q_vlan_del(ds, info); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 122ad5833fb1..f8f7b7c34e7d 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -17,7 +17,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | + * | DIR | VBID| SWITCH_ID | VBID | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -27,24 +27,14 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * SVL/SUBVLAN - { VID[9], VID[5:4] }: - * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. - * * 0 (0b000): Field does not encode a sub-VLAN, either because - * received traffic is untagged, PVID-tagged or because a second - * VLAN tag is present after this tag and not inside of it. - * * 1 (0b001): Received traffic is tagged with a VID value private - * to the host. This field encodes the index in the host's lookup - * table through which the value of the ingress VLAN ID can be - * recovered. - * * 2 (0b010): Field encodes a sub-VLAN. - * ... - * * 7 (0b111): Field encodes a sub-VLAN. - * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero - * (by the host) and ignored on receive (by the switch). - * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * + * VBID - { VID[9], VID[5:4] }: + * Virtual bridge ID. If between 1 and 7, packet targets the broadcast + * domain of a bridge. If transmitted as zero, packet targets a single + * port. Field only valid on transmit, must be ignored on receive. + * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. */ @@ -61,23 +51,30 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) -#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 -#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) -#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 -#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(5, 4) -#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) -#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) -#define DSA_8021Q_SUBVLAN(x) \ - (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ - DSA_8021Q_SUBVLAN_LO_MASK) | \ - ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ - DSA_8021Q_SUBVLAN_HI_MASK)) +#define DSA_8021Q_VBID_HI_SHIFT 9 +#define DSA_8021Q_VBID_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_VBID_LO_SHIFT 4 +#define DSA_8021Q_VBID_LO_MASK GENMASK(5, 4) +#define DSA_8021Q_VBID_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_VBID_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_VBID(x) \ + (((DSA_8021Q_VBID_LO(x) << DSA_8021Q_VBID_LO_SHIFT) & \ + DSA_8021Q_VBID_LO_MASK) | \ + ((DSA_8021Q_VBID_HI(x) << DSA_8021Q_VBID_HI_SHIFT) & \ + DSA_8021Q_VBID_HI_MASK)) #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) +u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num) +{ + /* The VBID value of 0 is reserved for precise TX */ + return DSA_8021Q_DIR_TX | DSA_8021Q_VBID(bridge_num + 1); +} +EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); + /* Returns the VID to be inserted into the frame from xmit for switch steering * instructions on egress. Encodes switch ID and port ID. */ @@ -98,13 +95,6 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); -u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) -{ - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); -} -EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); - /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { @@ -119,20 +109,6 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); -/* Returns the decoded subvlan from the RX VID. */ -u16 dsa_8021q_rx_subvlan(u16 vid) -{ - u16 svl_hi, svl_lo; - - svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> - DSA_8021Q_SUBVLAN_HI_SHIFT; - svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> - DSA_8021Q_SUBVLAN_LO_SHIFT; - - return (svl_hi << 2) | svl_lo; -} -EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); - bool vid_is_dsa_8021q_rxvlan(u16 vid) { return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; @@ -151,21 +127,152 @@ bool vid_is_dsa_8021q(u16 vid) } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -/* If @enabled is true, installs @vid with @flags into the switch port's HW - * filter. - * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the - * user explicitly configured this @vid through the bridge core, then the @vid - * is installed again, but this time with the flags from the bridge layer. - */ -static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, - u16 flags, bool enabled) +static struct dsa_tag_8021q_vlan * +dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid) +{ + struct dsa_tag_8021q_vlan *v; + + list_for_each_entry(v, &ctx->vlans, list) + if (v->vid == vid && v->port == port) + return v; + + return NULL; +} + +static int dsa_switch_do_tag_8021q_vlan_add(struct dsa_switch *ds, int port, + u16 vid, u16 flags) { - struct dsa_port *dp = dsa_to_port(ctx->ds, port); + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_tag_8021q_vlan *v; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->tag_8021q_vlan_add(ds, port, vid, flags); + + v = dsa_tag_8021q_vlan_find(ctx, port, vid); + if (v) { + refcount_inc(&v->refcount); + return 0; + } + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) + return -ENOMEM; - if (enabled) - return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags); + err = ds->ops->tag_8021q_vlan_add(ds, port, vid, flags); + if (err) { + kfree(v); + return err; + } - return ctx->ops->vlan_del(ctx->ds, dp->index, vid); + v->vid = vid; + v->port = port; + refcount_set(&v->refcount, 1); + list_add_tail(&v->list, &ctx->vlans); + + return 0; +} + +static int dsa_switch_do_tag_8021q_vlan_del(struct dsa_switch *ds, int port, + u16 vid) +{ + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_tag_8021q_vlan *v; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->tag_8021q_vlan_del(ds, port, vid); + + v = dsa_tag_8021q_vlan_find(ctx, port, vid); + if (!v) + return -ENOENT; + + if (!refcount_dec_and_test(&v->refcount)) + return 0; + + err = ds->ops->tag_8021q_vlan_del(ds, port, vid); + if (err) { + refcount_inc(&v->refcount); + return err; + } + + list_del(&v->list); + kfree(v); + + return 0; +} + +static bool +dsa_switch_tag_8021q_vlan_match(struct dsa_switch *ds, int port, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + return true; + + if (ds->dst->index == info->tree_index && ds->index == info->sw_index) + return port == info->port; + + return false; +} + +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + int port, err; + + /* Since we use dsa_broadcast(), there might be other switches in other + * trees which don't support tag_8021q, so don't return an error. + * Or they might even support tag_8021q but have not registered yet to + * use it (maybe they use another tagger currently). + */ + if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) { + u16 flags = 0; + + if (dsa_is_user_port(ds, port)) + flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + if (vid_is_dsa_8021q_rxvlan(info->vid) && + dsa_8021q_rx_switch_id(info->vid) == ds->index && + dsa_8021q_rx_source_port(info->vid) == port) + flags |= BRIDGE_VLAN_INFO_PVID; + + err = dsa_switch_do_tag_8021q_vlan_add(ds, port, + info->vid, + flags); + if (err) + return err; + } + } + + return 0; +} + +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + int port, err; + + if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_tag_8021q_vlan_match(ds, port, info)) { + err = dsa_switch_do_tag_8021q_vlan_del(ds, port, + info->vid); + if (err) + return err; + } + } + + return 0; } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -181,12 +288,6 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * force all switched traffic to pass through the CPU. So we must also make * the other front-panel ports members of this VID we're adding, albeit * we're not making it their PVID (they'll still have their own). - * By the way - just because we're installing the same VID in multiple - * switch ports doesn't mean that they'll start to talk to one another, even - * while not bridged: the final forwarding decision is still an AND between - * the L2 forwarding information (which is limiting forwarding in this case) - * and the VLAN-based restrictions (of which there are none in this case, - * since all ports are members). * - On TX (ingress from CPU and towards network) we are faced with a problem. * If we were to tag traffic (from within DSA) with the port's pvid, all * would be well, assuming the switch ports were standalone. Frames would @@ -200,9 +301,10 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * a member of the VID we're tagging the traffic with - the desired one. * * So at the end, each front-panel port will have one RX VID (also the PVID), - * the RX VID of all other front-panel ports, and one TX VID. Whereas the CPU - * port will have the RX and TX VIDs of all front-panel ports, and on top of - * that, is also tagged-input and tagged-output (VLAN trunk). + * the RX VID of all other front-panel ports that are in the same bridge, and + * one TX VID. Whereas the CPU port will have the RX and TX VIDs of all + * front-panel ports, and on top of that, is also tagged-input and + * tagged-output (VLAN trunk). * * CPU port CPU port * +-------------+-----+-------------+ +-------------+-----+-------------+ @@ -220,246 +322,246 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 */ -static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port, - bool enabled) +static bool dsa_tag_8021q_bridge_match(struct dsa_switch *ds, int port, + struct dsa_notifier_bridge_info *info) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + + /* Don't match on self */ + if (ds->dst->index == info->tree_index && + ds->index == info->sw_index && + port == info->port) + return false; + + if (dsa_port_is_user(dp)) + return dp->bridge_dev == info->br; + + return false; +} + +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) +{ + struct dsa_switch *targeted_ds; + struct dsa_port *targeted_dp; + u16 targeted_rx_vid; + int err, port; + + if (!ds->tag_8021q_ctx) + return 0; + + targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port); + + for (port = 0; port < ds->num_ports; port++) { + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + if (!dsa_tag_8021q_bridge_match(ds, port, info)) + continue; + + /* Install the RX VID of the targeted port in our VLAN table */ + err = dsa_port_tag_8021q_vlan_add(dp, targeted_rx_vid, true); + if (err) + return err; + + /* Install our RX VID into the targeted port's VLAN table */ + err = dsa_port_tag_8021q_vlan_add(targeted_dp, rx_vid, true); + if (err) + return err; + } + + return 0; +} + +int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) +{ + struct dsa_switch *targeted_ds; + struct dsa_port *targeted_dp; + u16 targeted_rx_vid; + int port; + + if (!ds->tag_8021q_ctx) + return 0; + + targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + targeted_rx_vid = dsa_8021q_rx_vid(targeted_ds, info->port); + + for (port = 0; port < ds->num_ports; port++) { + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + if (!dsa_tag_8021q_bridge_match(ds, port, info)) + continue; + + /* Remove the RX VID of the targeted port from our VLAN table */ + dsa_port_tag_8021q_vlan_del(dp, targeted_rx_vid, true); + + /* Remove our RX VID from the targeted port's VLAN table */ + dsa_port_tag_8021q_vlan_del(targeted_dp, rx_vid, true); + } + + return 0; +} + +int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) { - int upstream = dsa_upstream_port(ctx->ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); - u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port); + u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + + return dsa_port_tag_8021q_vlan_add(dsa_to_port(ds, port), tx_vid, + true); +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_offload); + +void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + + dsa_port_tag_8021q_vlan_del(dsa_to_port(ds, port), tx_vid, true); +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_unoffload); + +/* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */ +static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) +{ + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + u16 tx_vid = dsa_8021q_tx_vid(ds, port); struct net_device *master; - int i, err, subvlan; + int err; /* The CPU port is implicitly configured by * configuring the front-panel ports */ - if (!dsa_is_user_port(ctx->ds, port)) + if (!dsa_port_is_user(dp)) return 0; - master = dsa_to_port(ctx->ds, port)->cpu_dp->master; + master = dp->cpu_dp->master; /* Add this user port's RX VID to the membership list of all others * (including itself). This is so that bridging will not be hindered. * L2 forwarding rules still take precedence when there are no VLAN * restrictions, so there are no concerns about leaking traffic. */ - for (i = 0; i < ctx->ds->num_ports; i++) { - u16 flags; - - if (i == upstream) - continue; - else if (i == port) - /* The RX VID is pvid on this port */ - flags = BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_PVID; - else - /* The RX VID is a regular VLAN on all others */ - flags = BRIDGE_VLAN_INFO_UNTAGGED; - - err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled); - if (err) { - dev_err(ctx->ds->dev, - "Failed to apply RX VID %d to port %d: %d\n", - rx_vid, port, err); - return err; - } - } - - /* CPU port needs to see this port's RX VID - * as tagged egress. - */ - err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled); + err = dsa_port_tag_8021q_vlan_add(dp, rx_vid, false); if (err) { - dev_err(ctx->ds->dev, - "Failed to apply RX VID %d to port %d: %d\n", - rx_vid, port, err); + dev_err(ds->dev, + "Failed to apply RX VID %d to port %d: %pe\n", + rx_vid, port, ERR_PTR(err)); return err; } - /* Add to the master's RX filter not only @rx_vid, but in fact - * the entire subvlan range, just in case this DSA switch might - * want to use sub-VLANs. - */ - for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) { - u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan); - - if (enabled) - vlan_vid_add(master, ctx->proto, vid); - else - vlan_vid_del(master, ctx->proto, vid); - } + /* Add @rx_vid to the master's RX filter. */ + vlan_vid_add(master, ctx->proto, rx_vid); /* Finally apply the TX VID on this port and on the CPU port */ - err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED, - enabled); - if (err) { - dev_err(ctx->ds->dev, - "Failed to apply TX VID %d on port %d: %d\n", - tx_vid, port, err); - return err; - } - err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled); + err = dsa_port_tag_8021q_vlan_add(dp, tx_vid, false); if (err) { - dev_err(ctx->ds->dev, - "Failed to apply TX VID %d on port %d: %d\n", - tx_vid, upstream, err); + dev_err(ds->dev, + "Failed to apply TX VID %d on port %d: %pe\n", + tx_vid, port, ERR_PTR(err)); return err; } return err; } -int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled) +static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) { - int rc, port; + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + u16 tx_vid = dsa_8021q_tx_vid(ds, port); + struct net_device *master; - ASSERT_RTNL(); + /* The CPU port is implicitly configured by + * configuring the front-panel ports + */ + if (!dsa_port_is_user(dp)) + return; - for (port = 0; port < ctx->ds->num_ports; port++) { - rc = dsa_8021q_setup_port(ctx, port, enabled); - if (rc < 0) { - dev_err(ctx->ds->dev, - "Failed to setup VLAN tagging for port %d: %d\n", - port, rc); - return rc; - } - } + master = dp->cpu_dp->master; - return 0; -} -EXPORT_SYMBOL_GPL(dsa_8021q_setup); + dsa_port_tag_8021q_vlan_del(dp, rx_vid, false); -static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx, - int port, - struct dsa_8021q_context *other_ctx, - int other_port, bool enabled) -{ - u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); + vlan_vid_del(master, ctx->proto, rx_vid); - /* @rx_vid of local @ds port @port goes to @other_port of - * @other_ds - */ - return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid, - BRIDGE_VLAN_INFO_UNTAGGED, enabled); + dsa_port_tag_8021q_vlan_del(dp, tx_vid, false); } -static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +static int dsa_tag_8021q_setup(struct dsa_switch *ds) { - struct dsa_8021q_crosschip_link *c; + int err, port; - list_for_each_entry(c, &ctx->crosschip_links, list) { - if (c->port == port && c->other_ctx == other_ctx && - c->other_port == other_port) { - refcount_inc(&c->refcount); - return 0; + ASSERT_RTNL(); + + for (port = 0; port < ds->num_ports; port++) { + err = dsa_tag_8021q_port_setup(ds, port); + if (err < 0) { + dev_err(ds->dev, + "Failed to setup VLAN tagging for port %d: %pe\n", + port, ERR_PTR(err)); + return err; } } - dev_dbg(ctx->ds->dev, - "adding crosschip link from port %d to %s port %d\n", - port, dev_name(other_ctx->ds->dev), other_port); - - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - - c->port = port; - c->other_ctx = other_ctx; - c->other_port = other_port; - refcount_set(&c->refcount, 1); - - list_add(&c->list, &ctx->crosschip_links); - return 0; } -static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx, - struct dsa_8021q_crosschip_link *c, - bool *keep) +static void dsa_tag_8021q_teardown(struct dsa_switch *ds) { - *keep = !refcount_dec_and_test(&c->refcount); - - if (*keep) - return; + int port; - dev_dbg(ctx->ds->dev, - "deleting crosschip link from port %d to %s port %d\n", - c->port, dev_name(c->other_ctx->ds->dev), c->other_port); + ASSERT_RTNL(); - list_del(&c->list); - kfree(c); + for (port = 0; port < ds->num_ports; port++) + dsa_tag_8021q_port_teardown(ds, port); } -/* Make traffic from local port @port be received by remote port @other_port. - * This means that our @rx_vid needs to be installed on @other_ds's upstream - * and user ports. The user ports should be egress-untagged so that they can - * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged - * or untagged: it doesn't matter, since it should never egress a frame having - * our @rx_vid. - */ -int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) { - /* @other_upstream is how @other_ds reaches us. If we are part - * of disjoint trees, then we are probably connected through - * our CPU ports. If we're part of the same tree though, we should - * probably use dsa_towards_port. - */ - int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); - int rc; + struct dsa_8021q_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; - rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port); - if (rc) - return rc; + ctx->proto = proto; + ctx->ds = ds; - rc = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, - other_port, true); - if (rc) - return rc; + INIT_LIST_HEAD(&ctx->vlans); - rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream); - if (rc) - return rc; + ds->tag_8021q_ctx = ctx; - return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, - other_upstream, true); + return dsa_tag_8021q_setup(ds); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_register); -int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +void dsa_tag_8021q_unregister(struct dsa_switch *ds) { - int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); - struct dsa_8021q_crosschip_link *c, *n; - - list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) { - if (c->port == port && c->other_ctx == other_ctx && - (c->other_port == other_port || - c->other_port == other_upstream)) { - struct dsa_8021q_context *other_ctx = c->other_ctx; - int other_port = c->other_port; - bool keep; - int rc; - - dsa_8021q_crosschip_link_del(ctx, c, &keep); - if (keep) - continue; - - rc = dsa_8021q_crosschip_link_apply(ctx, port, - other_ctx, - other_port, - false); - if (rc) - return rc; - } + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_tag_8021q_vlan *v, *n; + + dsa_tag_8021q_teardown(ds); + + list_for_each_entry_safe(v, n, &ctx->vlans, list) { + list_del(&v->list); + kfree(v); } - return 0; + ds->tag_8021q_ctx = NULL; + + kfree(ctx); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) @@ -471,4 +573,23 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -MODULE_LICENSE("GPL v2"); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id) +{ + u16 vid, tci; + + skb_push_rcsum(skb, ETH_HLEN); + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + __vlan_hwaccel_clear_tag(skb); + } else { + __skb_vlan_pop(skb, &tci); + } + skb_pull_rcsum(skb, ETH_HLEN); + + vid = tci & VLAN_VID_MASK; + + *source_port = dsa_8021q_rx_source_port(vid); + *switch_id = dsa_8021q_rx_switch_id(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rcv); diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 002cf7f952e2..8a02ac44282f 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -44,8 +44,7 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, } static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb, - struct net_device *ndev, - struct packet_type *pt) + struct net_device *ndev) { u8 ver, port; u16 hdr; @@ -85,7 +84,7 @@ static const struct dsa_device_ops ar9331_netdev_ops = { .proto = DSA_TAG_PROTO_AR9331, .xmit = ar9331_tag_xmit, .rcv = ar9331_tag_rcv, - .overhead = AR9331_HDR_LEN, + .needed_headroom = AR9331_HDR_LEN, }; MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 40e9f3098c8d..96dbb8ee2fee 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -99,7 +99,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, skb_push(skb, BRCM_TAG_LEN); if (offset) - memmove(skb->data, skb->data + BRCM_TAG_LEN, offset); + dsa_alloc_etype_header(skb, BRCM_TAG_LEN); brcm_tag = skb->data + offset; @@ -136,7 +136,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, */ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, unsigned int offset) { int source_port; @@ -167,7 +166,7 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -182,20 +181,16 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, } -static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; /* skb->data points to the EtherType, the tag is right before it */ - nskb = brcm_tag_rcv_ll(skb, dev, pt, 2); + nskb = brcm_tag_rcv_ll(skb, dev, 2); if (!nskb) return nskb; - /* Move the Ethernet DA and SA */ - memmove(nskb->data - ETH_HLEN, - nskb->data - ETH_HLEN - BRCM_TAG_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, BRCM_TAG_LEN); return nskb; } @@ -205,7 +200,7 @@ static const struct dsa_device_ops brcm_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM, .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, - .overhead = BRCM_TAG_LEN, + .needed_headroom = BRCM_TAG_LEN, }; DSA_TAG_DRIVER(brcm_netdev_ops); @@ -233,7 +228,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, skb_push(skb, BRCM_LEG_TAG_LEN); - memmove(skb->data, skb->data + BRCM_LEG_TAG_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN); brcm_tag = skb->data + 2 * ETH_ALEN; @@ -251,8 +246,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, } static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { int source_port; u8 *brcm_tag; @@ -260,7 +254,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) return NULL; - brcm_tag = skb->data - 2; + brcm_tag = dsa_etype_header_pos_rx(skb); source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; @@ -271,12 +265,9 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_LEG_TAG_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); - /* Move the Ethernet DA and SA */ - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - BRCM_LEG_TAG_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, BRCM_LEG_TAG_LEN); return skb; } @@ -286,7 +277,7 @@ static const struct dsa_device_ops brcm_legacy_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM_LEGACY, .xmit = brcm_leg_tag_xmit, .rcv = brcm_leg_tag_rcv, - .overhead = BRCM_LEG_TAG_LEN, + .needed_headroom = BRCM_LEG_TAG_LEN, }; DSA_TAG_DRIVER(brcm_legacy_netdev_ops); @@ -302,11 +293,10 @@ static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, } static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { /* tag is prepended to the packet */ - return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); + return brcm_tag_rcv_ll(skb, dev, ETH_HLEN); } static const struct dsa_device_ops brcm_prepend_netdev_ops = { @@ -314,7 +304,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = { .proto = DSA_TAG_PROTO_BRCM_PREPEND, .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, - .overhead = BRCM_TAG_LEN, + .needed_headroom = BRCM_TAG_LEN, }; DSA_TAG_DRIVER(brcm_prepend_netdev_ops); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 7e7b7decdf39..77d0ce89ab77 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -126,18 +126,53 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); + u8 tag_dev, tag_port; + enum dsa_cmd cmd; u8 *dsa_header; + u16 pvid = 0; + int err; + + if (skb->offload_fwd_mark) { + struct dsa_switch_tree *dst = dp->ds->dst; + struct net_device *br = dp->bridge_dev; + + cmd = DSA_CMD_FORWARD; + + /* When offloading forwarding for a bridge, inject FORWARD + * packets on behalf of a virtual switch device with an index + * past the physical switches. + */ + tag_dev = dst->last_switch + 1 + dp->bridge_num; + tag_port = 0; + + /* If we are offloading forwarding for a VLAN-unaware bridge, + * inject packets to hardware using the bridge's pvid, since + * that's where the packets ingressed from. + */ + if (!br_vlan_enabled(br)) { + /* Safe because __dev_queue_xmit() runs under + * rcu_read_lock_bh() + */ + err = br_vlan_get_pvid_rcu(br, &pvid); + if (err) + return NULL; + } + } else { + cmd = DSA_CMD_FROM_CPU; + tag_dev = dp->ds->index; + tag_port = dp->index; + } if (skb->protocol == htons(ETH_P_8021Q)) { if (extra) { skb_push(skb, extra); - memmove(skb->data, skb->data + extra, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, extra); } - /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. */ - dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index; - dsa_header[1] = dp->index << 3; + /* Construct tagged DSA tag from 802.1Q tag. */ + dsa_header = dsa_etype_header_pos_tx(skb) + extra; + dsa_header[0] = (cmd << 6) | 0x20 | tag_dev; + dsa_header[1] = tag_port << 3; /* Move CFI field from byte 2 to byte 1. */ if (dsa_header[2] & 0x10) { @@ -146,14 +181,15 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, } } else { skb_push(skb, DSA_HLEN + extra); - memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN); - - /* Construct untagged FROM_CPU DSA tag. */ - dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index; - dsa_header[1] = dp->index << 3; - dsa_header[2] = 0x00; - dsa_header[3] = 0x00; + dsa_alloc_etype_header(skb, DSA_HLEN + extra); + + /* Construct untagged DSA tag. */ + dsa_header = dsa_etype_header_pos_tx(skb) + extra; + + dsa_header[0] = (cmd << 6) | tag_dev; + dsa_header[1] = tag_port << 3; + dsa_header[2] = pvid >> 8; + dsa_header[3] = pvid & 0xff; } return skb; @@ -162,20 +198,18 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { + bool trap = false, trunk = false; int source_device, source_port; - bool trunk = false; enum dsa_code code; enum dsa_cmd cmd; u8 *dsa_header; /* The ethertype field is part of the DSA header. */ - dsa_header = skb->data - 2; + dsa_header = dsa_etype_header_pos_rx(skb); cmd = dsa_header[0] >> 6; switch (cmd) { case DSA_CMD_FORWARD: - skb->offload_fwd_mark = 1; - trunk = !!(dsa_header[1] & 7); break; @@ -194,7 +228,6 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, * device (like a bridge) that forwarding has * already been done by hardware. */ - skb->offload_fwd_mark = 1; break; case DSA_CODE_MGMT_TRAP: case DSA_CODE_IGMP_MLD_TRAP: @@ -202,6 +235,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, /* Traps have, by definition, not been * forwarded by hardware, so don't mark them. */ + trap = true; break; default: /* Reserved code, this could be anything. Drop @@ -235,6 +269,15 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; + /* When using LAG offload, skb->dev is not a DSA slave interface, + * so we cannot call dsa_default_offload_fwd_mark and we need to + * special-case it. + */ + if (trunk) + skb->offload_fwd_mark = true; + else if (!trap) + dsa_default_offload_fwd_mark(skb); + /* If the 'tagged' bit is set; convert the DSA tag to a 802.1Q * tag, and delete the ethertype (extra) if applicable. If the * 'tagged' bit is cleared; delete the DSA tag, and ethertype @@ -269,14 +312,10 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, memcpy(dsa_header, new_header, DSA_HLEN); if (extra) - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - extra, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, extra); } else { skb_pull_rcsum(skb, DSA_HLEN); - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - DSA_HLEN - extra, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, DSA_HLEN + extra); } return skb; @@ -289,8 +328,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) return dsa_xmit_ll(skb, dev, 0); } -static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev) { if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) return NULL; @@ -303,7 +341,7 @@ static const struct dsa_device_ops dsa_netdev_ops = { .proto = DSA_TAG_PROTO_DSA, .xmit = dsa_xmit, .rcv = dsa_rcv, - .overhead = DSA_HLEN, + .needed_headroom = DSA_HLEN, }; DSA_TAG_DRIVER(dsa_netdev_ops); @@ -322,7 +360,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) if (!skb) return NULL; - edsa_header = skb->data + 2 * ETH_ALEN; + edsa_header = dsa_etype_header_pos_tx(skb); edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; @@ -330,8 +368,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev) { if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) return NULL; @@ -346,7 +383,7 @@ static const struct dsa_device_ops edsa_netdev_ops = { .proto = DSA_TAG_PROTO_EDSA, .xmit = edsa_xmit, .rcv = edsa_rcv, - .overhead = EDSA_HLEN, + .needed_headroom = EDSA_HLEN, }; DSA_TAG_DRIVER(edsa_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 2f5bd5e338ab..df7140984da3 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -75,8 +75,7 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, } static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { int port; u8 *gswip_tag; @@ -103,7 +102,7 @@ static const struct dsa_device_ops gswip_netdev_ops = { .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, - .overhead = GSWIP_RX_HEADER_LEN, + .needed_headroom = GSWIP_RX_HEADER_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index a09805c8e1ab..f64b805303cd 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -29,8 +29,7 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, } static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { /* Tag decoding */ u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN; @@ -44,7 +43,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN); - skb->offload_fwd_mark = true; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -54,8 +53,7 @@ static const struct dsa_device_ops hellcreek_netdev_ops = { .proto = DSA_TAG_PROTO_HELLCREEK, .xmit = hellcreek_xmit, .rcv = hellcreek_rcv, - .overhead = HELLCREEK_TAG_LEN, - .tail_tag = true, + .needed_tailroom = HELLCREEK_TAG_LEN, }; MODULE_LICENSE("Dual MIT/GPL"); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 4820dbcedfa2..fa1d60d13ad9 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -24,7 +24,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, pskb_trim_rcsum(skb, skb->len - len); - skb->offload_fwd_mark = true; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -53,6 +53,9 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) u8 *tag; u8 *addr; + if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) + return NULL; + /* Tag encoding */ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); addr = skb_mac_header(skb); @@ -64,8 +67,7 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev) { u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; @@ -77,8 +79,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ8795, .xmit = ksz8795_xmit, .rcv = ksz8795_rcv, - .overhead = KSZ_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz8795_netdev_ops); @@ -115,6 +116,9 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, u8 *addr; u16 val; + if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) + return NULL; + /* Tag encoding */ tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); addr = skb_mac_header(skb); @@ -129,8 +133,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, return skb; } -static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) { /* Tag decoding */ u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; @@ -149,8 +152,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, - .overhead = KSZ9477_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ9477_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); @@ -166,6 +168,9 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, u8 *addr; u8 *tag; + if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) + return NULL; + /* Tag encoding */ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); addr = skb_mac_header(skb); @@ -183,8 +188,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, - .overhead = KSZ_INGRESS_TAG_LEN, - .tail_tag = true, + .needed_tailroom = KSZ_INGRESS_TAG_LEN, }; DSA_TAG_DRIVER(ksz9893_netdev_ops); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index aa1318dccaf0..cb548188f813 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -62,9 +62,10 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) skb_push(skb, LAN9303_TAG_LEN); /* make room between MACs and Ether-Type */ - memmove(skb->data, skb->data + LAN9303_TAG_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, LAN9303_TAG_LEN); + + lan9303_tag = dsa_etype_header_pos_tx(skb); - lan9303_tag = (__be16 *)(skb->data + 2 * ETH_ALEN); tag = lan9303_xmit_use_arl(dp, skb->data) ? LAN9303_TAG_TX_USE_ALR : dp->index | LAN9303_TAG_TX_STP_OVERRIDE; @@ -74,8 +75,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev) { __be16 *lan9303_tag; u16 lan9303_tag1; @@ -87,13 +87,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; } - /* '->data' points into the middle of our special VLAN tag information: - * - * ~ MAC src | 0x81 | 0x00 | 0xyy | 0xzz | ether type - * ^ - * ->data - */ - lan9303_tag = (__be16 *)(skb->data - 2); + lan9303_tag = dsa_etype_header_pos_rx(skb); if (lan9303_tag[0] != htons(ETH_P_8021Q)) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n"); @@ -113,9 +107,11 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, * and the current ethertype field. */ skb_pull_rcsum(skb, 2 + 2); - memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), - 2 * ETH_ALEN); - skb->offload_fwd_mark = !(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU); + + dsa_strip_etype_header(skb, LAN9303_TAG_LEN); + + if (!(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU)) + dsa_default_offload_fwd_mark(skb); return skb; } @@ -125,7 +121,7 @@ static const struct dsa_device_ops lan9303_netdev_ops = { .proto = DSA_TAG_PROTO_LAN9303, .xmit = lan9303_xmit, .rcv = lan9303_rcv, - .overhead = LAN9303_TAG_LEN, + .needed_headroom = LAN9303_TAG_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index f9b2966d1936..415d8ece242a 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -41,10 +41,10 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, default: xmit_tpid = MTK_HDR_XMIT_UNTAGGED; skb_push(skb, MTK_HDR_LEN); - memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, MTK_HDR_LEN); } - mtk_tag = skb->data + 2 * ETH_ALEN; + mtk_tag = dsa_etype_header_pos_tx(skb); /* Mark tag attribute on special tag insertion to notify hardware * whether that's a combined special tag with 802.1Q header. @@ -61,8 +61,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, return skb; } -static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev) { u16 hdr; int port; @@ -71,19 +70,13 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) return NULL; - /* The MTK header is added by the switch between src addr - * and ethertype at this point, skb->data points to 2 bytes - * after src addr so header should be 2 bytes right before. - */ - phdr = (__be16 *)(skb->data - 2); + phdr = dsa_etype_header_pos_rx(skb); hdr = ntohs(*phdr); /* Remove MTK tag and recalculate checksum. */ skb_pull_rcsum(skb, MTK_HDR_LEN); - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - MTK_HDR_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, MTK_HDR_LEN); /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); @@ -92,7 +85,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -102,7 +95,7 @@ static const struct dsa_device_ops mtk_netdev_ops = { .proto = DSA_TAG_PROTO_MTK, .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, - .overhead = MTK_HDR_LEN, + .needed_headroom = MTK_HDR_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 91f0fd1242cd..d37ab98e7fe1 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -55,8 +55,7 @@ static struct sk_buff *seville_xmit(struct sk_buff *skb, } static struct sk_buff *ocelot_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { u64 src_port, qos_class; u64 vlan_tci, tag_type; @@ -104,7 +103,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, */ return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; /* Ocelot switches copy frames unmodified to the CPU. However, it is @@ -143,7 +142,7 @@ static const struct dsa_device_ops ocelot_netdev_ops = { .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, .rcv = ocelot_rcv, - .overhead = OCELOT_TOTAL_TAG_LEN, + .needed_headroom = OCELOT_TOTAL_TAG_LEN, .promisc_on_master = true, }; @@ -155,7 +154,7 @@ static const struct dsa_device_ops seville_netdev_ops = { .proto = DSA_TAG_PROTO_SEVILLE, .xmit = seville_xmit, .rcv = ocelot_rcv, - .overhead = OCELOT_TOTAL_TAG_LEN, + .needed_headroom = OCELOT_TOTAL_TAG_LEN, .promisc_on_master = true, }; diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 62a93303bd63..3038a257ba05 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -38,32 +38,17 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, } static struct sk_buff *ocelot_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { - int src_port, switch_id, qos_class; - u16 vid, tci; + int src_port, switch_id; - skb_push_rcsum(skb, ETH_HLEN); - if (skb_vlan_tag_present(skb)) { - tci = skb_vlan_tag_get(skb); - __vlan_hwaccel_clear_tag(skb); - } else { - __skb_vlan_pop(skb, &tci); - } - skb_pull_rcsum(skb, ETH_HLEN); - - vid = tci & VLAN_VID_MASK; - src_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); - qos_class = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + dsa_8021q_rcv(skb, &src_port, &switch_id); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) return NULL; - skb->offload_fwd_mark = 1; - skb->priority = qos_class; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -73,7 +58,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = { .proto = DSA_TAG_PROTO_OCELOT_8021Q, .xmit = ocelot_xmit, .rcv = ocelot_rcv, - .overhead = VLAN_HLEN, + .needed_headroom = VLAN_HLEN, .promisc_on_master = true, }; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 88181b52f480..1ea9401b8ace 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -36,8 +36,8 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) skb_push(skb, QCA_HDR_LEN); - memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); - phdr = (__be16 *)(skb->data + 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, QCA_HDR_LEN); + phdr = dsa_etype_header_pos_tx(skb); /* Set the version field, and set destination port information */ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | @@ -48,8 +48,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) { u8 ver; u16 hdr; @@ -59,11 +58,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) return NULL; - /* The QCA header is added by the switch between src addr and Ethertype - * At this point, skb->data points to ethertype so header should be - * right before - */ - phdr = (__be16 *)(skb->data - 2); + phdr = dsa_etype_header_pos_rx(skb); hdr = ntohs(*phdr); /* Make sure the version is correct */ @@ -73,8 +68,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Remove QCA tag and recalculate checksum */ skb_pull_rcsum(skb, QCA_HDR_LEN); - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, - ETH_HLEN - QCA_HDR_LEN); + dsa_strip_etype_header(skb, QCA_HDR_LEN); /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); @@ -91,7 +85,7 @@ static const struct dsa_device_ops qca_netdev_ops = { .proto = DSA_TAG_PROTO_QCA, .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, - .overhead = QCA_HDR_LEN, + .needed_headroom = QCA_HDR_LEN, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index cf8ac316f4c7..f920487ae145 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -47,16 +47,17 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, dp->index); skb_push(skb, RTL4_A_HDR_LEN); - memmove(skb->data, skb->data + RTL4_A_HDR_LEN, 2 * ETH_ALEN); - tag = skb->data + 2 * ETH_ALEN; + dsa_alloc_etype_header(skb, RTL4_A_HDR_LEN); + tag = dsa_etype_header_pos_tx(skb); /* Set Ethertype */ p = (__be16 *)tag; *p = htons(RTL4_A_ETHERTYPE); - out = (RTL4_A_PROTOCOL_RTL8366RB << 12) | (2 << 8); - /* The lower bits is the port number */ - out |= (u8)dp->index; + out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT) | (2 << 8); + /* The lower bits indicate the port number */ + out |= BIT(dp->index); + p = (__be16 *)(tag + 2); *p = htons(out); @@ -64,8 +65,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, } static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { u16 protport; __be16 *p; @@ -77,12 +77,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN))) return NULL; - /* The RTL4 header has its own custom Ethertype 0x8899 and that - * starts right at the beginning of the packet, after the src - * ethernet addr. Apparently skb->data always points 2 bytes in, - * behind the Ethertype. - */ - tag = skb->data - 2; + tag = dsa_etype_header_pos_rx(skb); p = (__be16 *)tag; etype = ntohs(*p); if (etype != RTL4_A_ETHERTYPE) { @@ -109,12 +104,9 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, /* Remove RTL4 tag and recalculate checksum */ skb_pull_rcsum(skb, RTL4_A_HDR_LEN); - /* Move ethernet DA and SA in front of the data */ - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - RTL4_A_HDR_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, RTL4_A_HDR_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -124,7 +116,7 @@ static const struct dsa_device_ops rtl4a_netdev_ops = { .proto = DSA_TAG_PROTO_RTL4_A, .xmit = rtl4a_tag_xmit, .rcv = rtl4a_tag_rcv, - .overhead = RTL4_A_HDR_LEN, + .needed_headroom = RTL4_A_HDR_LEN, }; module_dsa_tag_driver(rtl4a_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 50496013cdb7..c054f48541c8 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -7,6 +7,52 @@ #include <linux/packing.h> #include "dsa_priv.h" +/* Is this a TX or an RX header? */ +#define SJA1110_HEADER_HOST_TO_SWITCH BIT(15) + +/* RX header */ +#define SJA1110_RX_HEADER_IS_METADATA BIT(14) +#define SJA1110_RX_HEADER_HOST_ONLY BIT(13) +#define SJA1110_RX_HEADER_HAS_TRAILER BIT(12) + +/* Trap-to-host format (no trailer present) */ +#define SJA1110_RX_HEADER_SRC_PORT(x) (((x) & GENMASK(7, 4)) >> 4) +#define SJA1110_RX_HEADER_SWITCH_ID(x) ((x) & GENMASK(3, 0)) + +/* Timestamp format (trailer present) */ +#define SJA1110_RX_HEADER_TRAILER_POS(x) ((x) & GENMASK(11, 0)) + +#define SJA1110_RX_TRAILER_SWITCH_ID(x) (((x) & GENMASK(7, 4)) >> 4) +#define SJA1110_RX_TRAILER_SRC_PORT(x) ((x) & GENMASK(3, 0)) + +/* Meta frame format (for 2-step TX timestamps) */ +#define SJA1110_RX_HEADER_N_TS(x) (((x) & GENMASK(8, 4)) >> 4) + +/* TX header */ +#define SJA1110_TX_HEADER_UPDATE_TC BIT(14) +#define SJA1110_TX_HEADER_TAKE_TS BIT(13) +#define SJA1110_TX_HEADER_TAKE_TS_CASC BIT(12) +#define SJA1110_TX_HEADER_HAS_TRAILER BIT(11) + +/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is false */ +#define SJA1110_TX_HEADER_PRIO(x) (((x) << 7) & GENMASK(10, 7)) +#define SJA1110_TX_HEADER_TSTAMP_ID(x) ((x) & GENMASK(7, 0)) + +/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is true */ +#define SJA1110_TX_HEADER_TRAILER_POS(x) ((x) & GENMASK(10, 0)) + +#define SJA1110_TX_TRAILER_TSTAMP_ID(x) (((x) << 24) & GENMASK(31, 24)) +#define SJA1110_TX_TRAILER_PRIO(x) (((x) << 21) & GENMASK(23, 21)) +#define SJA1110_TX_TRAILER_SWITCHID(x) (((x) << 12) & GENMASK(15, 12)) +#define SJA1110_TX_TRAILER_DESTPORTS(x) (((x) << 1) & GENMASK(11, 1)) + +#define SJA1110_META_TSTAMP_SIZE 10 + +#define SJA1110_HEADER_LEN 4 +#define SJA1110_RX_TRAILER_LEN 13 +#define SJA1110_TX_TRAILER_LEN 4 +#define SJA1110_MAX_PADDING_LEN 15 + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -69,56 +115,117 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } -static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) +/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ +static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp, + struct sk_buff *skb) { - struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); - u16 vlan_tci; + struct sja1105_port *sp = dp->priv; - if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) - return true; + if (!dsa_port_is_sja1105(dp)) + return skb; - if (hdr->h_vlan_proto != htons(ETH_P_8021Q) && - !skb_vlan_tag_present(skb)) - return false; - - if (skb_vlan_tag_present(skb)) - vlan_tci = skb_vlan_tag_get(skb); - else - vlan_tci = ntohs(hdr->h_vlan_TCI); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + skb_queue_tail(&sp->xmit_queue, skb_get(skb)); + kthread_queue_work(sp->xmit_worker, &sp->xmit_work); - return vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK); + return NULL; } -/* This is the first time the tagger sees the frame on RX. - * Figure out if we can decode it. +/* Send VLAN tags with a TPID that blends in with whatever VLAN protocol a + * bridge spanning ports of this switch might have. */ -static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) +static u16 sja1105_xmit_tpid(struct dsa_port *dp) { - if (sja1105_can_use_vlan_as_tags(skb)) - return true; - if (sja1105_is_link_local(skb)) - return true; - if (sja1105_is_meta_frame(skb)) - return true; - return false; + struct dsa_switch *ds = dp->ds; + struct dsa_port *other_dp; + u16 proto; + + /* Since VLAN awareness is global, then if this port is VLAN-unaware, + * all ports are. Use the VLAN-unaware TPID used for tag_8021q. + */ + if (!dsa_port_is_vlan_filtering(dp)) + return ETH_P_SJA1105; + + /* Port is VLAN-aware, so there is a bridge somewhere (a single one, + * we're sure about that). It may not be on this port though, so we + * need to find it. + */ + list_for_each_entry(other_dp, &ds->dst->ports, list) { + if (other_dp->ds != ds) + continue; + + if (!other_dp->bridge_dev) + continue; + + /* Error is returned only if CONFIG_BRIDGE_VLAN_FILTERING, + * which seems pointless to handle, as our port cannot become + * VLAN-aware in that case. + */ + br_vlan_get_proto(other_dp->bridge_dev, &proto); + + return proto; + } + + WARN_ONCE(1, "Port is VLAN-aware but cannot find associated bridge!\n"); + + return ETH_P_SJA1105; } -/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ -static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, - struct sk_buff *skb) +static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb, + struct net_device *netdev) { - /* Increase refcount so the kfree_skb in dsa_slave_xmit - * won't really free the packet. + struct dsa_port *dp = dsa_slave_to_port(netdev); + struct net_device *br = dp->bridge_dev; + u16 tx_vid; + + /* If the port is under a VLAN-aware bridge, just slide the + * VLAN-tagged packet into the FDB and hope for the best. + * This works because we support a single VLAN-aware bridge + * across the entire dst, and its VLANs cannot be shared with + * any standalone port. */ - skb_queue_tail(&sp->xmit_queue, skb_get(skb)); - kthread_queue_work(sp->xmit_worker, &sp->xmit_work); + if (br_vlan_enabled(br)) + return skb; - return NULL; + /* If the port is under a VLAN-unaware bridge, use an imprecise + * TX VLAN that targets the bridge's entire broadcast domain, + * instead of just the specific port. + */ + tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(dp->bridge_num); + + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid); } -static u16 sja1105_xmit_tpid(struct sja1105_port *sp) +/* Transform untagged control packets into pvid-tagged control packets so that + * all packets sent by this tagger are VLAN-tagged and we can configure the + * switch to drop untagged packets coming from the DSA master. + */ +static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp, + struct sk_buff *skb, u8 pcp) { - return sp->xmit_tpid; + __be16 xmit_tpid = htons(sja1105_xmit_tpid(dp)); + struct vlan_ethhdr *hdr; + + /* If VLAN tag is in hwaccel area, move it to the payload + * to deal with both cases uniformly and to ensure that + * the VLANs are added in the right order. + */ + if (unlikely(skb_vlan_tag_present(skb))) { + skb = __vlan_hwaccel_push_inside(skb); + if (!skb) + return NULL; + } + + hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + + /* If skb is already VLAN-tagged, leave that VLAN ID in place */ + if (hdr->h_vlan_proto == xmit_tpid) + return skb; + + return vlan_insert_tag(skb, xmit_tpid, (pcp << VLAN_PRIO_SHIFT) | + SJA1105_DEFAULT_VLAN); } static struct sk_buff *sja1105_xmit(struct sk_buff *skb, @@ -129,17 +236,78 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + if (skb->offload_fwd_mark) + return sja1105_imprecise_xmit(skb, netdev); + /* Transmitting management traffic does not rely upon switch tagging, * but instead SPI-installed management routes. Part 2 of this * is the .port_deferred_xmit driver callback. */ - if (unlikely(sja1105_is_link_local(skb))) - return sja1105_defer_xmit(dp->priv, skb); + if (unlikely(sja1105_is_link_local(skb))) { + skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp); + if (!skb) + return NULL; - return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), + return sja1105_defer_xmit(dp, skb); + } + + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } +static struct sk_buff *sja1110_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; + struct dsa_port *dp = dsa_slave_to_port(netdev); + u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); + u16 queue_mapping = skb_get_queue_mapping(skb); + u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + __be32 *tx_trailer; + __be16 *tx_header; + int trailer_pos; + + if (skb->offload_fwd_mark) + return sja1105_imprecise_xmit(skb, netdev); + + /* Transmitting control packets is done using in-band control + * extensions, while data packets are transmitted using + * tag_8021q TX VLANs. + */ + if (likely(!sja1105_is_link_local(skb))) + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), + ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); + + skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp); + if (!skb) + return NULL; + + skb_push(skb, SJA1110_HEADER_LEN); + + dsa_alloc_etype_header(skb, SJA1110_HEADER_LEN); + + trailer_pos = skb->len; + + tx_header = dsa_etype_header_pos_tx(skb); + tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN); + + tx_header[0] = htons(ETH_P_SJA1110); + tx_header[1] = htons(SJA1110_HEADER_HOST_TO_SWITCH | + SJA1110_TX_HEADER_HAS_TRAILER | + SJA1110_TX_HEADER_TRAILER_POS(trailer_pos)); + *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) | + SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) | + SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index))); + if (clone) { + u8 ts_id = SJA1105_SKB_CB(clone)->ts_id; + + tx_header[1] |= htons(SJA1110_TX_HEADER_TAKE_TS); + *tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id)); + } + + return skb; +} + static void sja1105_transfer_meta(struct sk_buff *skb, const struct sja1105_meta *meta) { @@ -147,7 +315,7 @@ static void sja1105_transfer_meta(struct sk_buff *skb, hdr->h_dest[3] = meta->dmac_byte_3; hdr->h_dest[4] = meta->dmac_byte_4; - SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp; + SJA1105_SKB_CB(skb)->tstamp = meta->tstamp; } /* This is a simple state machine which follows the hardware mechanism of @@ -176,16 +344,16 @@ static struct sk_buff bool is_link_local, bool is_meta) { - struct sja1105_port *sp; - struct dsa_port *dp; - - dp = dsa_slave_to_port(skb->dev); - sp = dp->priv; - /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ if (is_link_local) { + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; + + if (unlikely(!dsa_port_is_sja1105(dp))) + return skb; + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) /* Do normal processing. */ return skb; @@ -218,8 +386,13 @@ static struct sk_buff * frame, which serves no further purpose). */ } else if (is_meta) { + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; struct sk_buff *stampable_skb; + if (unlikely(!dsa_port_is_sja1105(dp))) + return skb; + /* Drop the meta frame if we're not in the right state * to process it. */ @@ -261,60 +434,58 @@ static struct sk_buff return skb; } -static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) +static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb) { - struct dsa_port *dp = dsa_slave_to_port(skb->dev); - struct sja1105_port *sp = dp->priv; - u16 vid = sp->subvlan_map[subvlan]; + u16 tpid = ntohs(eth_hdr(skb)->h_proto); + + return tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || + skb_vlan_tag_present(skb); +} + +static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) +{ + return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110; +} + +/* If the VLAN in the packet is a tag_8021q one, set @source_port and + * @switch_id and strip the header. Otherwise set @vid and keep it in the + * packet. + */ +static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, + int *switch_id, u16 *vid) +{ + struct vlan_ethhdr *hdr = (struct vlan_ethhdr *)skb_mac_header(skb); u16 vlan_tci; - if (vid == VLAN_N_VID) - return; + if (skb_vlan_tag_present(skb)) + vlan_tci = skb_vlan_tag_get(skb); + else + vlan_tci = ntohs(hdr->h_vlan_TCI); + + if (vid_is_dsa_8021q_rxvlan(vlan_tci & VLAN_VID_MASK)) + return dsa_8021q_rcv(skb, source_port, switch_id); - vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); + /* Try our best with imprecise RX */ + *vid = vlan_tci & VLAN_VID_MASK; } static struct sk_buff *sja1105_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { + int source_port = -1, switch_id = -1; struct sja1105_meta meta = {0}; - int source_port, switch_id; struct ethhdr *hdr; - u16 tpid, vid, tci; bool is_link_local; - u16 subvlan = 0; - bool is_tagged; bool is_meta; + u16 vid; hdr = eth_hdr(skb); - tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q || - skb_vlan_tag_present(skb)); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); - skb->offload_fwd_mark = 1; - - if (is_tagged) { + if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - skb_push_rcsum(skb, ETH_HLEN); - if (skb_vlan_tag_present(skb)) { - tci = skb_vlan_tag_get(skb); - __vlan_hwaccel_clear_tag(skb); - } else { - __skb_vlan_pop(skb, &tci); - } - skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - vid = tci & VLAN_VID_MASK; - source_port = dsa_8021q_rx_source_port(vid); - switch_id = dsa_8021q_rx_switch_id(vid); - skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - subvlan = dsa_8021q_rx_subvlan(vid); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -333,19 +504,157 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (source_port == -1 || switch_id == -1) + skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); + else + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); if (!skb->dev) { netdev_warn(netdev, "Couldn't decode source port\n"); return NULL; } - if (subvlan) - sja1105_decode_subvlan(skb, subvlan); + if (!is_link_local) + dsa_default_offload_fwd_mark(skb); return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } +static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) +{ + u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN; + int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); + int n_ts = SJA1110_RX_HEADER_N_TS(rx_header); + struct net_device *master = skb->dev; + struct dsa_port *cpu_dp; + struct dsa_switch *ds; + int i; + + cpu_dp = master->dsa_ptr; + ds = dsa_switch_find(cpu_dp->dst->index, switch_id); + if (!ds) { + net_err_ratelimited("%s: cannot find switch id %d\n", + master->name, switch_id); + return NULL; + } + + for (i = 0; i <= n_ts; i++) { + u8 ts_id, source_port, dir; + u64 tstamp; + + ts_id = buf[0]; + source_port = (buf[1] & GENMASK(7, 4)) >> 4; + dir = (buf[1] & BIT(3)) >> 3; + tstamp = be64_to_cpu(*(__be64 *)(buf + 2)); + + sja1110_process_meta_tstamp(ds, source_port, ts_id, dir, + tstamp); + + buf += SJA1110_META_TSTAMP_SIZE; + } + + /* Discard the meta frame, we've consumed the timestamps it contained */ + return NULL; +} + +static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, + int *source_port, + int *switch_id, + bool *host_only) +{ + u16 rx_header; + + if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN))) + return NULL; + + /* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly + * what we need because the caller has checked the EtherType (which is + * located 2 bytes back) and we just need a pointer to the header that + * comes afterwards. + */ + rx_header = ntohs(*(__be16 *)skb->data); + + if (rx_header & SJA1110_RX_HEADER_HOST_ONLY) + *host_only = true; + + if (rx_header & SJA1110_RX_HEADER_IS_METADATA) + return sja1110_rcv_meta(skb, rx_header); + + /* Timestamp frame, we have a trailer */ + if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) { + int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header); + u8 *rx_trailer = skb_tail_pointer(skb) - SJA1110_RX_TRAILER_LEN; + u64 *tstamp = &SJA1105_SKB_CB(skb)->tstamp; + u8 last_byte = rx_trailer[12]; + + /* The timestamp is unaligned, so we need to use packing() + * to get it + */ + packing(rx_trailer, tstamp, 63, 0, 8, UNPACK, 0); + + *source_port = SJA1110_RX_TRAILER_SRC_PORT(last_byte); + *switch_id = SJA1110_RX_TRAILER_SWITCH_ID(last_byte); + + /* skb->len counts from skb->data, while start_of_padding + * counts from the destination MAC address. Right now skb->data + * is still as set by the DSA master, so to trim away the + * padding and trailer we need to account for the fact that + * skb->data points to skb_mac_header(skb) + ETH_HLEN. + */ + pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN); + /* Trap-to-host frame, no timestamp trailer */ + } else { + *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header); + *switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); + } + + /* Advance skb->data past the DSA header */ + skb_pull_rcsum(skb, SJA1110_HEADER_LEN); + + dsa_strip_etype_header(skb, SJA1110_HEADER_LEN); + + /* With skb->data in its final place, update the MAC header + * so that eth_hdr() continues to works properly. + */ + skb_set_mac_header(skb, -ETH_HLEN); + + return skb; +} + +static struct sk_buff *sja1110_rcv(struct sk_buff *skb, + struct net_device *netdev) +{ + int source_port = -1, switch_id = -1; + bool host_only = false; + u16 vid = 0; + + if (sja1110_skb_has_inband_control_extension(skb)) { + skb = sja1110_rcv_inband_control_extension(skb, &source_port, + &switch_id, + &host_only); + if (!skb) + return NULL; + } + + /* Packets with in-band control extensions might still have RX VLANs */ + if (likely(sja1105_skb_has_tag_8021q(skb))) + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); + + if (source_port == -1 || switch_id == -1) + skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); + else + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (!skb->dev) { + netdev_warn(netdev, "Couldn't decode source port\n"); + return NULL; + } + + if (!host_only) + dsa_default_offload_fwd_mark(skb); + + return skb; +} + static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto, int *offset) { @@ -356,18 +665,51 @@ static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto, dsa_tag_generic_flow_dissect(skb, proto, offset); } +static void sja1110_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + /* Management frames have 2 DSA tags on RX, so the needed_headroom we + * declared is fine for the generic dissector adjustment procedure. + */ + if (unlikely(sja1105_is_link_local(skb))) + return dsa_tag_generic_flow_dissect(skb, proto, offset); + + /* For the rest, there is a single DSA tag, the tag_8021q one */ + *offset = VLAN_HLEN; + *proto = ((__be16 *)skb->data)[(VLAN_HLEN / 2) - 1]; +} + static const struct dsa_device_ops sja1105_netdev_ops = { .name = "sja1105", .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, .rcv = sja1105_rcv, - .filter = sja1105_filter, - .overhead = VLAN_HLEN, + .needed_headroom = VLAN_HLEN, .flow_dissect = sja1105_flow_dissect, .promisc_on_master = true, }; -MODULE_LICENSE("GPL v2"); +DSA_TAG_DRIVER(sja1105_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105); -module_dsa_tag_driver(sja1105_netdev_ops); +static const struct dsa_device_ops sja1110_netdev_ops = { + .name = "sja1110", + .proto = DSA_TAG_PROTO_SJA1110, + .xmit = sja1110_xmit, + .rcv = sja1110_rcv, + .flow_dissect = sja1110_flow_dissect, + .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN, + .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN, +}; + +DSA_TAG_DRIVER(sja1110_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110); + +static struct dsa_tag_driver *sja1105_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(sja1105_netdev_ops), + &DSA_TAG_DRIVER_NAME(sja1110_netdev_ops), +}; + +module_dsa_tag_drivers(sja1105_tag_driver_array); + +MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5b97ede56a0f..5749ba85c2b8 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -24,8 +24,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev) { u8 *trailer; int source_port; @@ -55,8 +54,7 @@ static const struct dsa_device_ops trailer_netdev_ops = { .proto = DSA_TAG_PROTO_TRAILER, .xmit = trailer_xmit, .rcv = trailer_rcv, - .overhead = 4, - .tail_tag = true, + .needed_tailroom = 4, }; MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index 858cdf9d2913..ff442b8af636 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -25,8 +25,7 @@ static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev) { int source_port; u8 *trailer; @@ -46,7 +45,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; /* Frame is forwarded by hardware, don't forward in software. */ - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -56,8 +55,7 @@ static const struct dsa_device_ops xrs700x_netdev_ops = { .proto = DSA_TAG_PROTO_XRS700X, .xmit = xrs700x_xmit, .rcv = xrs700x_rcv, - .overhead = 1, - .tail_tag = true, + .needed_tailroom = 1, }; MODULE_LICENSE("GPL"); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9cce612e8976..73fce9467467 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -62,8 +62,6 @@ #include <linux/uaccess.h> #include <net/pkt_sched.h> -__setup("ether=", netdev_boot_setup); - /** * eth_header - create the Ethernet header * @skb: buffer to alter @@ -182,12 +180,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * at all, so we check here whether one of those tagging * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. - * The DSA tagging protocol may be able to decode some but not all - * traffic (for example only for management). In that case give it the - * option to filter the packets from which it can decode source port - * information. */ - if (unlikely(netdev_uses_dsa(dev)) && dsa_can_decode(skb, dev)) + if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 723c9a8a8cdf..0a19470efbfb 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 1d6bc132aa4d..46776ea42a92 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -10,6 +10,7 @@ struct coalesce_req_info { struct coalesce_reply_data { struct ethnl_reply_data base; struct ethtool_coalesce coalesce; + struct kernel_ethtool_coalesce kernel_coalesce; u32 supported_params; }; @@ -61,6 +62,7 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -70,7 +72,8 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, + &data->kernel_coalesce, extack); ethnl_ops_complete(dev); return ret; @@ -100,7 +103,9 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ - nla_total_size(sizeof(u32)); /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ + nla_total_size(sizeof(u8)); /* _USE_CQE_MODE_RX */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -124,6 +129,7 @@ static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; @@ -170,7 +176,11 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, coal->tx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, - coal->rate_sample_interval, supported)) + coal->rate_sample_interval, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, + kcoal->use_cqe_mode_tx, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, + kcoal->use_cqe_mode_rx, supported)) return -EMSGSIZE; return 0; @@ -215,10 +225,13 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), }; int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) { + struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; @@ -255,7 +268,8 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = ops->get_coalesce(dev, &coalesce); + ret = ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + info->extack); if (ret < 0) goto out_ops; @@ -303,11 +317,16 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); + ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx, + tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod); + ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, + tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod); ret = 0; if (!mod) goto out_ops; - ret = dev->ethtool_ops->set_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, + info->extack); if (ret < 0) goto out_ops; ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index f9dcbad84788..c63e0739dc6a 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -4,6 +4,7 @@ #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/rtnetlink.h> +#include <linux/ptp_clock_kernel.h> #include "common.h" @@ -397,6 +398,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats", [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", + [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); @@ -554,6 +556,18 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) return 0; } +int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) +{ + struct ethtool_ts_info info = { }; + int num = 0; + + if (!__ethtool_get_ts_info(dev, &info)) + num = ptp_get_vclocks_index(info.phc_index, vclock_index); + + return num; +} +EXPORT_SYMBOL(ethtool_get_phc_vclocks); + const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 5d38e90895ac..7e6b37a54add 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -159,9 +159,6 @@ static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr * request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); - if (!request->length) - return -EINVAL; - /* The following set of conditions limit the API to only dump 1/2 * EEPROM page without crossing low page boundary located at offset 128. * This means user may only request dumps of length limited to 128 from @@ -180,10 +177,6 @@ static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr * NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross half page boundary is illegal"); return -EINVAL; - } else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) { - NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET], - "offset is out of bounds"); - return -EINVAL; } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross page boundary is illegal"); @@ -236,8 +229,10 @@ const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { const struct nla_policy ethnl_module_eeprom_get_policy[] = { [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), - [ETHTOOL_A_MODULE_EEPROM_OFFSET] = { .type = NLA_U32 }, - [ETHTOOL_A_MODULE_EEPROM_LENGTH] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_OFFSET] = + NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1), + [ETHTOOL_A_MODULE_EEPROM_LENGTH] = + NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index baa5d10043cb..f2abc3152888 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -7,6 +7,7 @@ * the information ethtool needs. */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> @@ -23,6 +24,7 @@ #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> +#include <linux/pm_runtime.h> #include <net/devlink.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> @@ -807,6 +809,120 @@ out: return ret; } +static noinline_for_stack int +ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc, + const struct compat_ethtool_rxnfc __user *useraddr, + size_t size) +{ + struct compat_ethtool_rxnfc crxnfc = {}; + + /* We expect there to be holes between fs.m_ext and + * fs.ring_cookie and at the end of fs, but nowhere else. + * On non-x86, no conversion should be needed. + */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) && + sizeof(struct compat_ethtool_rxnfc) != + sizeof(struct ethtool_rxnfc)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + + sizeof(useraddr->fs.m_ext) != + offsetof(struct ethtool_rxnfc, fs.m_ext) + + sizeof(rxnfc->fs.m_ext)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) - + offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != + offsetof(struct ethtool_rxnfc, fs.location) - + offsetof(struct ethtool_rxnfc, fs.ring_cookie)); + + if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc)))) + return -EFAULT; + + *rxnfc = (struct ethtool_rxnfc) { + .cmd = crxnfc.cmd, + .flow_type = crxnfc.flow_type, + .data = crxnfc.data, + .fs = { + .flow_type = crxnfc.fs.flow_type, + .h_u = crxnfc.fs.h_u, + .h_ext = crxnfc.fs.h_ext, + .m_u = crxnfc.fs.m_u, + .m_ext = crxnfc.fs.m_ext, + .ring_cookie = crxnfc.fs.ring_cookie, + .location = crxnfc.fs.location, + }, + .rule_cnt = crxnfc.rule_cnt, + }; + + return 0; +} + +static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc, + const void __user *useraddr, + size_t size) +{ + if (compat_need_64bit_alignment_fixup()) + return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size); + + if (copy_from_user(rxnfc, useraddr, size)) + return -EFAULT; + + return 0; +} + +static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + struct compat_ethtool_rxnfc crxnfc; + + memset(&crxnfc, 0, sizeof(crxnfc)); + crxnfc = (struct compat_ethtool_rxnfc) { + .cmd = rxnfc->cmd, + .flow_type = rxnfc->flow_type, + .data = rxnfc->data, + .fs = { + .flow_type = rxnfc->fs.flow_type, + .h_u = rxnfc->fs.h_u, + .h_ext = rxnfc->fs.h_ext, + .m_u = rxnfc->fs.m_u, + .m_ext = rxnfc->fs.m_ext, + .ring_cookie = rxnfc->fs.ring_cookie, + .location = rxnfc->fs.location, + }, + .rule_cnt = rxnfc->rule_cnt, + }; + + if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc)))) + return -EFAULT; + + return 0; +} + +static int ethtool_rxnfc_copy_to_user(void __user *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + int ret; + + if (compat_need_64bit_alignment_fixup()) { + ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size, + rule_buf); + useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs); + } else { + ret = copy_to_user(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + } + + if (ret) + return -EFAULT; + + if (rule_buf) { + if (copy_to_user(useraddr, rule_buf, + rxnfc->rule_cnt * sizeof(u32))) + return -EFAULT; + } + + return 0; +} + static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -825,7 +941,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; rc = dev->ethtool_ops->set_rxnfc(dev, &info); @@ -833,7 +949,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return rc; if (cmd == ETHTOOL_SRXCLSRLINS && - copy_to_user(useraddr, &info, info_size)) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL)) return -EFAULT; return 0; @@ -859,7 +975,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* If FLOW_RSS was requested then user-space must be using the @@ -867,7 +983,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, */ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { info_size = sizeof(info); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. @@ -893,18 +1009,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (ret < 0) goto err_out; - ret = -EFAULT; - if (copy_to_user(useraddr, &info, info_size)) - goto err_out; - - if (rule_buf) { - useraddr += offsetof(struct ethtool_rxnfc, rule_locs); - if (copy_to_user(useraddr, rule_buf, - info.rule_cnt * sizeof(u32))) - goto err_out; - } - ret = 0; - + ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); err_out: kfree(rule_buf); @@ -1514,12 +1619,14 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + struct kernel_ethtool_coalesce kernel_coalesce = {}; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; - ret = dev->ethtool_ops->get_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + NULL); if (ret) return ret; @@ -1586,19 +1693,26 @@ ethtool_set_coalesce_supported(struct net_device *dev, static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { + struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce; int ret; - if (!dev->ethtool_ops->set_coalesce) + if (!dev->ethtool_ops->set_coalesce && !dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + NULL); + if (ret) + return ret; + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; if (!ethtool_set_coalesce_supported(dev, &coalesce)) return -EOPNOTSUPP; - ret = dev->ethtool_ops->set_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, + NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); return ret; @@ -2581,15 +2695,14 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) /* The main entry point in this file. Called from net/core/dev_ioctl.c */ -int dev_ethtool(struct net *net, struct ifreq *ifr) +int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - void __user *useraddr = ifr->ifr_data; u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; - if (!dev || !netif_device_present(dev)) + if (!dev) return -ENODEV; if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) @@ -2645,10 +2758,18 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return -EPERM; } + if (dev->dev.parent) + pm_runtime_get_sync(dev->dev.parent); + + if (!netif_device_present(dev)) { + rc = -ENODEV; + goto out; + } + if (dev->ethtool_ops->begin) { rc = dev->ethtool_ops->begin(dev); - if (rc < 0) - return rc; + if (rc < 0) + goto out; } old_features = dev->features; @@ -2867,6 +2988,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (old_features != dev->features) netdev_features_change(dev); +out: + if (dev->dev.parent) + pm_runtime_put(dev->dev.parent); return rc; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88d8a0243f35..1797a0a90019 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -2,6 +2,7 @@ #include <net/sock.h> #include <linux/ethtool_netlink.h> +#include <linux/pm_runtime.h> #include "netlink.h" static struct genl_family ethtool_genl_family; @@ -29,6 +30,44 @@ const struct nla_policy ethnl_header_policy_stats[] = { ETHTOOL_FLAGS_STATS), }; +int ethnl_ops_begin(struct net_device *dev) +{ + int ret; + + if (!dev) + return -ENODEV; + + if (dev->dev.parent) + pm_runtime_get_sync(dev->dev.parent); + + if (!netif_device_present(dev)) { + ret = -ENODEV; + goto err; + } + + if (dev->ethtool_ops->begin) { + ret = dev->ethtool_ops->begin(dev); + if (ret) + goto err; + } + + return 0; +err: + if (dev->dev.parent) + pm_runtime_put(dev->dev.parent); + + return ret; +} + +void ethnl_ops_complete(struct net_device *dev) +{ + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (dev->dev.parent) + pm_runtime_put(dev->dev.parent); +} + /** * ethnl_parse_header_dev_get() - parse request header * @req_info: structure to put results into @@ -101,12 +140,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, return -EINVAL; } - if (dev && !netif_device_present(dev)) { - dev_put(dev); - NL_SET_ERR_MSG(extack, "device not present"); - return -ENODEV; - } - req_info->dev = dev; req_info->flags = flags; return 0; @@ -248,6 +281,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, + [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -315,9 +349,9 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; + int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; - int reply_len; int ret; ops = ethnl_default_requests[cmd]; @@ -346,21 +380,25 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; - reply_len = ret + ethnl_reply_header_size(); + reply_len = ret; ret = -ENOMEM; - rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd, + rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), + req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; + hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; + WARN_ONCE(rskb->len - hdr_len > reply_len, + "ethnl cmd %d: calculated reply length %d, but consumed %d\n", + cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(rskb, reply_payload); - if (req_info->dev) - dev_put(req_info->dev); + dev_put(req_info->dev); kfree(reply_data); kfree(req_info); return genlmsg_reply(rskb, info); @@ -372,8 +410,7 @@ err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_dev: - if (req_info->dev) - dev_put(req_info->dev); + dev_put(req_info->dev); kfree(reply_data); kfree(req_info); return ret; @@ -953,6 +990,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_phc_vclocks_get_policy, + .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 8abcbc10796c..e8987e28036f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -138,7 +138,7 @@ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, } /** - * ethnl_update_binary() - update binary data from NLA_BINARY atribute + * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null @@ -247,19 +247,8 @@ struct ethnl_reply_data { struct net_device *dev; }; -static inline int ethnl_ops_begin(struct net_device *dev) -{ - if (dev && dev->ethtool_ops->begin) - return dev->ethtool_ops->begin(dev); - else - return 0; -} - -static inline void ethnl_ops_complete(struct net_device *dev) -{ - if (dev && dev->ethtool_ops->complete) - dev->ethtool_ops->complete(dev); -} +int ethnl_ops_begin(struct net_device *dev); +void ethnl_ops_complete(struct net_device *dev); /** * struct ethnl_request_ops - unified handling of GET requests @@ -347,6 +336,7 @@ extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; +extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -369,7 +359,7 @@ extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; -extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1]; +extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; @@ -380,8 +370,9 @@ extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_T extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; -extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; +extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; +extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c new file mode 100644 index 000000000000..637b2f5297d5 --- /dev/null +++ b/net/ethtool/phc_vclocks.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 NXP + */ +#include "netlink.h" +#include "common.h" + +struct phc_vclocks_req_info { + struct ethnl_req_info base; +}; + +struct phc_vclocks_reply_data { + struct ethnl_reply_data base; + int num; + int *index; +}; + +#define PHC_VCLOCKS_REPDATA(__reply_base) \ + container_of(__reply_base, struct phc_vclocks_reply_data, base) + +const struct nla_policy ethnl_phc_vclocks_get_policy[] = { + [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->num = ethtool_get_phc_vclocks(dev, &data->index); + ethnl_ops_complete(dev); + + return ret; +} + +static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + int len = 0; + + if (data->num > 0) { + len += nla_total_size(sizeof(u32)); + len += nla_total_size(sizeof(s32) * data->num); + } + + return len; +} + +static int phc_vclocks_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + + if (data->num <= 0) + return 0; + + if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || + nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, + sizeof(s32) * data->num, data->index)) + return -EMSGSIZE; + + return 0; +} + +static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + + kfree(data->index); +} + +const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { + .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, + .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, + .req_info_size = sizeof(struct phc_vclocks_req_info), + .reply_data_size = sizeof(struct phc_vclocks_reply_data), + + .prepare_data = phc_vclocks_prepare_data, + .reply_size = phc_vclocks_reply_size, + .fill_reply = phc_vclocks_fill_reply, + .cleanup_data = phc_vclocks_cleanup_data, +}; diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index bb1351c38397..e31949479305 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -397,7 +397,8 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, * ensures entries of restarted nodes gets pruned so that they can * re-register and resume communications. */ - if (seq_nr_before(sequence_nr, node->seq_out[port->type])) + if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + seq_nr_before(sequence_nr, node->seq_out[port->type])) return; node->time_in[port->type] = jiffies; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 88215b5c93aa..dd5a45f8a78a 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -340,8 +340,7 @@ nla_put_failure: out_dev: wpan_phy_put(phy); out: - if (dev) - dev_put(dev); + dev_put(dev); return rc; } diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 0cf2374c143b..277124f206e0 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -2226,8 +2226,7 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { struct wpan_dev *wpan_dev = info->user_ptr[1]; - if (wpan_dev->netdev) - dev_put(wpan_dev->netdev); + dev_put(wpan_dev->netdev); } else { dev_put(info->user_ptr[1]); } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a45a0401adc5..7bb9ef35c570 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -41,8 +41,7 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: @@ -129,7 +128,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, int ret = -ENOIOCTLCMD; struct net_device *dev; - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, arg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; @@ -143,7 +142,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); - if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + if (!ret && put_user_ifreq(&ifr, arg)) ret = -EFAULT; dev_put(dev); @@ -984,6 +983,11 @@ static const struct proto_ops ieee802154_dgram_ops = { .sendpage = sock_no_sendpage, }; +static void ieee802154_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + /* Create a socket. Initialise the socket, blank the addresses * set the state. */ @@ -1024,7 +1028,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, sock->ops = ops; sock_init_data(sock, sk); - /* FIXME: sk->sk_destruct */ + sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2f94d221c00e..1d816a5fd3eb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -318,7 +318,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); - err = -ENOBUFS; + err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); @@ -953,10 +953,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: - if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); - if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq))) + if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; @@ -966,7 +966,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: - if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; @@ -1720,7 +1720,6 @@ EXPORT_SYMBOL_GPL(snmp_fold_field64); #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, - .netns_ok = 1, }; #endif @@ -1733,7 +1732,6 @@ static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; @@ -1746,14 +1744,12 @@ static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, - .netns_ok = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 36ed85bf2ad5..6eea1e9e998d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -450,6 +450,7 @@ static int ah4_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; + break; case ICMP_REDIRECT: break; default: @@ -554,7 +555,6 @@ static int ah4_rcv_cb(struct sk_buff *skb, int err) static const struct xfrm_type ah_type = { - .description = "AH4", .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9e41eff4a685..0dcee9df1326 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -10,6 +10,9 @@ #include <net/tcp.h> #include <net/bpf_sk_storage.h> +/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, init), offsetof(struct tcp_congestion_ops, release), @@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 prog_ops_moff(const struct bpf_prog *prog) +{ + const struct btf_member *m; + const struct btf_type *t; + u32 midx; + + midx = prog->expected_attach_type; + t = bpf_tcp_congestion_ops.type; + m = &btf_type_member(t)[midx]; + + return btf_member_bit_offset(t, m) / 8; +} + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + /* Does not allow release() to call setsockopt. + * release() is called when the current bpf-tcp-cc + * is retiring. It is not allowed to call + * setsockopt() to make further changes which + * may potentially allocate new resources. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + /* Since get/setsockopt is usually expected to + * be available together, disable getsockopt for + * release also to avoid usage surprise. + * The bpf-tcp-cc already has a more powerful way + * to read tcp_sock from the PTR_TO_BTF_ID. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_getsockopt_proto; + return NULL; default: return bpf_base_func_proto(func_id); } @@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } -/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; - struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e0480c6cebaa..099259fc826a 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -187,8 +187,7 @@ static int __init cipso_v4_cache_init(void) * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: - * Invalidates and frees any entries in the CIPSO cache. Returns zero on - * success and negative values on failure. + * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 1c6429c353a9..f4468980b675 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -215,7 +215,7 @@ static void devinet_sysctl_unregister(struct in_device *idev) static struct in_ifaddr *inet_alloc_ifa(void) { - return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL); + return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT); } static void inet_rcu_free_ifa(struct rcu_head *head) @@ -1243,7 +1243,7 @@ out: return ret; } -static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) +int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); const struct in_ifaddr *ifa; @@ -1950,16 +1950,17 @@ static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { }; static int inet_validate_link_af(const struct net_device *dev, - const struct nlattr *nla) + const struct nlattr *nla, + struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rcu(dev)) + if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, - inet_af_policy, NULL); + inet_af_policy, extack); if (err < 0) return err; @@ -1981,7 +1982,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { - struct in_device *in_dev = __in_dev_get_rcu(dev); + struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; @@ -2424,11 +2425,15 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; - int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + struct net *net = ctl->extra2; + int ret; - if (write && *valp != val) { - struct net *net = ctl->extra2; + if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_trylock()) { /* Restore the original values before restarting */ @@ -2762,8 +2767,6 @@ void __init devinet_init(void) INIT_HLIST_HEAD(&inet_addr_lst[i]); register_pernet_subsys(&devinet_ops); - - register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 35803ab7ac80..851f542928a3 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -97,7 +97,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, static void esp_ssg_unref(struct xfrm_state *x, void *tmp) { - struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; int extralen = 0; u8 *iv; @@ -105,9 +104,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) struct scatterlist *sg; if (x->props.flags & XFRM_STATE_ESN) - extralen += sizeof(*extra); + extralen += sizeof(struct esp_output_extra); - extra = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); @@ -673,7 +671,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -982,6 +980,7 @@ static int esp4_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; + break; case ICMP_REDIRECT: break; default: @@ -1198,7 +1197,6 @@ static int esp4_rcv_cb(struct sk_buff *skb, int err) static const struct xfrm_type esp_type = { - .description = "ESP4", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 33687cf58286..8e4e9aa12130 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -33,12 +33,11 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, struct xfrm_state *x; __be32 seq; __be32 spi; - int err; if (!pskb_pull(skb, offset)) return NULL; - if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) + if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); @@ -343,7 +342,6 @@ static const struct net_offload esp4_offload = { }; static const struct xfrm_type_offload esp_type_offload = { - .description = "ESP4 OFFLOAD", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..9fe13e4f5d08 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -371,6 +371,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; + } else { + swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) @@ -1122,10 +1124,8 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); - /* Add network specific broadcasts, when it takes a sense */ + /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, - prim, 0); fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); } @@ -1376,7 +1376,7 @@ static void nl_fib_input(struct sk_buff *skb) portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); + nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) @@ -1516,6 +1516,12 @@ static int __net_init ip_fib_net_init(struct net *net) if (err) return err; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* Default to 3-tuple */ + net->ipv4.sysctl_fib_multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; +#endif + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b58db1ca4bfb..e184bcb19943 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -25,7 +25,7 @@ struct fib_alias { #define FA_S_ACCESSED 0x01 -/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +/* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index a632b66bc13a..b42c429cebbe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -208,9 +208,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) void fib_nh_common_release(struct fib_nh_common *nhc) { - if (nhc->nhc_dev) - dev_put(nhc->nhc_dev); - + dev_put(nhc->nhc_dev); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); @@ -260,7 +258,7 @@ EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { spin_lock_bh(&fib_info_lock); - if (fi && --fi->fib_treeref == 0) { + if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -1373,7 +1371,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { - fi->fib_treeref++; + refcount_inc(&fi->fib_treeref); return fi; } } @@ -1547,11 +1545,11 @@ link_it: if (ofi) { fi->fib_dead = 1; free_fib_info(fi); - ofi->fib_treeref++; + refcount_inc(&ofi->fib_treeref); return ofi; } - fi->fib_treeref++; + refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); spin_lock_bh(&fib_info_lock); hlist_add_head(&fi->fib_hash, @@ -1874,6 +1872,7 @@ static int call_fib_nh_notifiers(struct fib_nh *nh, (nh->fib_nh_flags & RTNH_F_DEAD)) return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); + break; default: break; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 25cf387cca5b..8060524f4256 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2380,11 +2380,11 @@ void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index e5f69b0bf3df..8fcbc6258ec5 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -230,8 +230,8 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { + const struct net_offload __rcu **offloads; u8 proto = fou_from_sock(sk)->protocol; - const struct net_offload **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; @@ -263,10 +263,10 @@ out_unlock: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload *ops; + const struct net_offload __rcu **offloads; u8 proto = fou_from_sock(sk)->protocol; + const struct net_offload *ops; int err = -ENOSYS; - const struct net_offload **offloads; rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; @@ -311,7 +311,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload **offloads; + const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -457,8 +457,8 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); + const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5d1e6fe9d838..cbb2b4bb0dfa 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -195,7 +195,6 @@ static int gre_err(struct sk_buff *skb, u32 info) static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, - .netns_ok = 1, }; static int __init gre_init(void) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 752e392083e6..8b30cadff708 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -993,14 +993,8 @@ static bool icmp_redirect(struct sk_buff *skb) static bool icmp_echo(struct sk_buff *skb) { - struct icmp_ext_hdr *ext_hdr, _ext_hdr; - struct icmp_ext_echo_iio *iio, _iio; struct icmp_bxm icmp_param; - struct net_device *dev; - char buff[IFNAMSIZ]; struct net *net; - u16 ident_len; - u8 status; net = dev_net(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ @@ -1013,20 +1007,46 @@ static bool icmp_echo(struct sk_buff *skb) icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); - if (icmp_param.data.icmph.type == ICMP_ECHO) { + if (icmp_param.data.icmph.type == ICMP_ECHO) icmp_param.data.icmph.type = ICMP_ECHOREPLY; - goto send_reply; - } - if (!net->ipv4.sysctl_icmp_echo_enable_probe) + else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) return true; + + icmp_reply(&icmp_param, skb); + return true; +} + +/* Helper for icmp_echo and icmpv6_echo_reply. + * Searches for net_device that matches PROBE interface identifier + * and builds PROBE reply message in icmphdr. + * + * Returns false if PROBE responses are disabled via sysctl + */ + +bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) +{ + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; + struct net *net = dev_net(skb->dev); + struct net_device *dev; + char buff[IFNAMSIZ]; + u16 ident_len; + u8 status; + + if (!net->ipv4.sysctl_icmp_echo_enable_probe) + return false; + /* We currently only support probing interfaces on the proxy node * Check to ensure L-bit is set */ - if (!(ntohs(icmp_param.data.icmph.un.echo.sequence) & 1)) - return true; + if (!(ntohs(icmphdr->un.echo.sequence) & 1)) + return false; /* Clear status bits in reply message */ - icmp_param.data.icmph.un.echo.sequence &= htons(0xFF00); - icmp_param.data.icmph.type = ICMP_EXT_ECHOREPLY; + icmphdr->un.echo.sequence &= htons(0xFF00); + if (icmphdr->type == ICMP_EXT_ECHO) + icmphdr->type = ICMP_EXT_ECHOREPLY; + else + icmphdr->type = ICMPV6_EXT_ECHO_REPLY; ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); /* Size of iio is class_type dependent. * Only check header here and assign length based on ctype in the switch statement @@ -1066,7 +1086,7 @@ static bool icmp_echo(struct sk_buff *skb) if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + sizeof(struct in_addr)) goto send_mal_query; - dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr.s_addr); + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: @@ -1075,8 +1095,7 @@ static bool icmp_echo(struct sk_buff *skb) sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); - if (dev) - dev_hold(dev); + dev_hold(dev); break; #endif default: @@ -1087,8 +1106,8 @@ static bool icmp_echo(struct sk_buff *skb) goto send_mal_query; } if (!dev) { - icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF; - goto send_reply; + icmphdr->code = ICMP_EXT_CODE_NO_IF; + return true; } /* Fill bits in reply message */ if (dev->flags & IFF_UP) @@ -1098,14 +1117,13 @@ static bool icmp_echo(struct sk_buff *skb) if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) status |= ICMP_EXT_ECHOREPLY_IPV6; dev_put(dev); - icmp_param.data.icmph.un.echo.sequence |= htons(status); -send_reply: - icmp_reply(&icmp_param, skb); - return true; + icmphdr->un.echo.sequence |= htons(status); + return true; send_mal_query: - icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY; - goto send_reply; + icmphdr->code = ICMP_EXT_CODE_MAL_QUERY; + return true; } +EXPORT_SYMBOL_GPL(icmp_build_probe); /* * Handle ICMP Timestamp requests. diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6b3c558a4f23..d2e2b3d18c66 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -803,10 +803,17 @@ static void igmp_gq_timer_expire(struct timer_list *t) static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); + u32 mr_ifc_count; igmpv3_send_cr(in_dev); - if (in_dev->mr_ifc_count) { - in_dev->mr_ifc_count--; +restart: + mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); + + if (mr_ifc_count) { + if (cmpxchg(&in_dev->mr_ifc_count, + mr_ifc_count, + mr_ifc_count - 1) != mr_ifc_count) + goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } @@ -818,7 +825,7 @@ static void igmp_ifc_event(struct in_device *in_dev) struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv); igmp_ifc_start_timer(in_dev, 1); } @@ -957,7 +964,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, in_dev->mr_qri; } /* cancel the interface change timer */ - in_dev->mr_ifc_count = 0; + WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ @@ -1724,7 +1731,7 @@ void ip_mc_down(struct in_device *in_dev) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST - in_dev->mr_ifc_count = 0; + WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; @@ -1941,7 +1948,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - in_dev->mr_ifc_count = pmc->crcount; + WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); @@ -2120,7 +2127,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - in_dev->mr_ifc_count = pmc->crcount; + WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); @@ -2233,7 +2240,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ - atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); return err; } @@ -2382,7 +2389,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) count += psl->sl_max; - newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -2393,7 +2401,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ - atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } rcu_assign_pointer(pmc->sflist, newpsl); @@ -2468,19 +2477,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) goto done; } if (msf->imsf_numsrc) { - newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc), - GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, + msf->imsf_numsrc), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; - memcpy(newpsl->sl_addr, msf->imsf_slist, - msf->imsf_numsrc * sizeof(msf->imsf_slist[0])); + memcpy(newpsl->sl_addr, msf->imsf_slist_flex, + flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { - sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); + sock_kfree_s(sk, newpsl, + struct_size(newpsl, sl_addr, + newpsl->sl_max)); goto done; } } else { @@ -2493,7 +2505,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ - atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, @@ -2551,14 +2564,14 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; - len = copycount * sizeof(psl->sl_addr[0]); + len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && - copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len)) + copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len)) return -EFAULT; return 0; done: @@ -2713,6 +2726,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { + spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; @@ -2723,6 +2737,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; + spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd472eae4f5c..f25d02ad4a8a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk, bool relax, bool reuseport_ok) { struct sock *sk2; + bool reuseport_cb_ok; bool reuse = sk->sk_reuse; bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + rcu_read_lock(); + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); + rcu_read_unlock(); + /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed @@ -156,14 +164,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, if ((!relax || (!reuseport_ok && reuseport && sk2->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && + reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2))))) && inet_rcv_saddr_equal(sk, sk2, true)) break; } else if (!reuseport_ok || !reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || + !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2)))) { if (inet_rcv_saddr_equal(sk, sk2, true)) @@ -526,7 +534,8 @@ out: atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg && amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt); + mem_cgroup_charge_skmem(newsk->sk_memcg, amt, + GFP_KERNEL | __GFP_NOFAIL); release_sock(newsk); } @@ -687,6 +696,66 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock *inet_reqsk_clone(struct request_sock *req, + struct sock *sk) +{ + struct sock *req_sk, *nreq_sk; + struct request_sock *nreq; + + nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!nreq) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ + sock_put(sk); + return NULL; + } + + req_sk = req_to_sk(req); + nreq_sk = req_to_sk(nreq); + + memcpy(nreq_sk, req_sk, + offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + + sk_node_init(&nreq_sk->sk_node); + nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; +#ifdef CONFIG_XPS + nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; +#endif + nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; + + nreq->rsk_listener = sk; + + /* We need not acquire fastopenq->lock + * because the child socket is locked in inet_csk_listen_stop(). + */ + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) + rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); + + return nreq; +} + +static void reqsk_queue_migrated(struct request_sock_queue *queue, + const struct request_sock *req) +{ + if (req->num_timeout == 0) + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); +} + +static void reqsk_migrate_reset(struct request_sock *req) +{ + req->saved_syn = NULL; +#if IS_ENABLED(CONFIG_IPV6) + inet_rsk(req)->ipv6_opt = NULL; + inet_rsk(req)->pktopts = NULL; +#else + inet_rsk(req)->ireq_opt = NULL; +#endif +} + /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { @@ -727,15 +796,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; - struct net *net = sock_net(sk_listener); - struct inet_connection_sock *icsk = inet_csk(sk_listener); - struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) - goto drop; + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { + struct sock *nsk; + + nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); + if (!nsk) + goto drop; + nreq = inet_reqsk_clone(req, nsk); + if (!nreq) + goto drop; + + /* The new timer for the cloned req can decrease the 2 + * by calling inet_csk_reqsk_queue_drop_and_put(), so + * hold another count to prevent use-after-free and + * call reqsk_put() just before return. + */ + refcount_set(&nreq->rsk_refcnt, 2 + 1); + timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); + + req = nreq; + sk_listener = nsk; + } + + icsk = inet_csk(sk_listener); + net = sock_net(sk_listener); max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -754,6 +847,7 @@ static void reqsk_timer_handler(struct timer_list *t) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ + queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; @@ -778,10 +872,39 @@ static void reqsk_timer_handler(struct timer_list *t) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer(&req->rsk_timer, jiffies + timeo); + + if (!nreq) + return; + + if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { + /* delete timer */ + inet_csk_reqsk_queue_drop(sk_listener, nreq); + goto no_ownership; + } + + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(oreq); + reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); + reqsk_put(oreq); + + reqsk_put(nreq); return; } + + /* Even if we can clone the req, we may need not retransmit any more + * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another + * CPU may win the "own_req" race so that inet_ehash_insert() fails. + */ + if (nreq) { + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); +no_ownership: + reqsk_migrate_reset(nreq); + reqsk_queue_removed(queue, nreq); + __reqsk_free(nreq); + } + drop: - inet_csk_reqsk_queue_drop_and_put(sk_listener, req); + inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, @@ -997,12 +1120,42 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { - inet_csk_reqsk_queue_drop(sk, req); - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - if (inet_csk_reqsk_queue_add(sk, req, child)) + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + + if (sk != req->rsk_listener) { + /* another listening sk has been selected, + * migrate the req to it. + */ + struct request_sock *nreq; + + /* hold a refcnt for the nreq->rsk_listener + * which is assigned in inet_reqsk_clone() + */ + sock_hold(sk); + nreq = inet_reqsk_clone(req, sk); + if (!nreq) { + inet_child_forget(sk, req, child); + goto child_put; + } + + refcount_set(&nreq->rsk_refcnt, 1); + if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(req); + reqsk_put(req); + return child; + } + + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; + } } /* Too bad, another child took ownership of the request, undo. */ +child_put: bh_unlock_sock(child); sock_put(child); return NULL; @@ -1028,14 +1181,40 @@ void inet_csk_listen_stop(struct sock *sk) * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { - struct sock *child = req->sk; + struct sock *child = req->sk, *nsk; + struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); + nsk = reuseport_migrate_sock(sk, child, NULL); + if (nsk) { + nreq = inet_reqsk_clone(req, nsk); + if (nreq) { + refcount_set(&nreq->rsk_refcnt, 1); + + if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(req); + } else { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQFAILURE); + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } + + /* inet_csk_reqsk_queue_add() has already + * called inet_child_forget() on failure case. + */ + goto skip_child_forget; + } + } + inet_child_forget(sk, req, child); +skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 93474b1bea4e..ef7897226f08 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, - u16 nlmsg_flags) + u16 nlmsg_flags, bool net_admin) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; @@ -444,6 +444,12 @@ static int inet_twsk_diag_fill(struct sock *sk, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + nlmsg_end(skb, nlh); return 0; } @@ -494,7 +500,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); @@ -574,10 +580,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, nlmsg_free(rep); goto out; } - err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) @@ -801,6 +804,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = sk->sk_mark; else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c96866a53a66..80aeaf9e6e16 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -697,7 +697,7 @@ void inet_unhash(struct sock *sk) goto unlock; if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); + reuseport_stop_listen_sock(sk); if (ilb) { inet_unhash2(hashinfo, sk); ilb->count--; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a68bf4c6fe9b..0fe6c936dc54 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -107,6 +107,8 @@ module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; +static const struct header_ops ipgre_header_ops; + static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, @@ -364,7 +366,10 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, raw_proto, false) < 0) goto drop; - if (tunnel->dev->type != ARPHRD_NONE) + /* Special case for ipgre_header_parse(), which expects the + * mac_header to point to the outer IP header. + */ + if (tunnel->dev->header_ops == &ipgre_header_ops) skb_pop_mac_header(skb); else skb_reset_mac_header(skb); @@ -625,15 +630,20 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { + const int pull_len = tunnel->hlen + sizeof(struct iphdr); + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; + if (pull_len > skb_transport_offset(skb)) + goto free_skb; + /* Pull skb since ip_tunnel_xmit() needs skb->data pointing * to gre header. */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + skb_pull(skb, pull_len); skb_reset_mac_header(skb); } else { if (skb_cow_head(skb, dev->needed_headroom)) @@ -918,7 +928,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c3efc7d658f6..9bca57ef8b83 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,19 +198,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); - /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); - if (!skb2) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } if (lwtunnel_xmit_redirect(dst->lwtstate)) { @@ -446,8 +437,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); - memcpy(&iph->saddr, &fl4->saddr, - sizeof(fl4->saddr) + sizeof(fl4->daddr)); + + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ @@ -614,18 +606,6 @@ void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, } EXPORT_SYMBOL(ip_fraglist_init); -static void ip_fraglist_ipcb_prepare(struct sk_buff *skb, - struct ip_fraglist_iter *iter) -{ - struct sk_buff *to = iter->frag; - - /* Copy the flags to each fragment. */ - IPCB(to)->flags = IPCB(skb)->flags; - - if (iter->offset == 0) - ip_options_fragment(to); -} - void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { unsigned int hlen = iter->hlen; @@ -671,7 +651,7 @@ void ip_frag_init(struct sk_buff *skb, unsigned int hlen, EXPORT_SYMBOL(ip_frag_init); static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, - bool first_frag, struct ip_frag_state *state) + bool first_frag) { /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; @@ -846,11 +826,14 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, /* Everything is OK. Generate! */ ip_fraglist_init(skb, iph, hlen, &iter); + if (iter.frag) + ip_options_fragment(iter.frag); + for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) { - ip_fraglist_ipcb_prepare(skb, &iter); + IPCB(iter.frag)->flags = IPCB(skb)->flags; ip_fraglist_prepare(skb, &iter); } @@ -905,7 +888,7 @@ slow_path: err = PTR_ERR(skb2); goto fail; } - ip_frag_ipcb(skb, skb2, first_frag, &state); + ip_frag_ipcb(skb, skb2, first_frag); /* * Put this fragment into the sending queue. @@ -1054,7 +1037,7 @@ static int __ip_append_data(struct sock *sk, unsigned int datalen; unsigned int fraglen; unsigned int fraggap; - unsigned int alloclen; + unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: @@ -1074,35 +1057,39 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; pagedlen = 0; + alloc_extra = hh_len + 15; + alloc_extra += exthdrlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. + */ + if (datalen == length + fraggap) + alloc_extra += rt->dst.trailer_len; + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else if (!paged) + else if (!paged && + (fraglen + alloc_extra < SKB_MAX_ALLOC || + !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = min_t(int, fraglen, MAX_HEADER); pagedlen = fraglen - alloclen; } - alloclen += exthdrlen; - - /* The last fragment gets additional space at tail. - * Note, with MSG_MORE we overallocate on fragments, - * because we have no idea what fragment will be - * the last. - */ - if (datalen == length + fraggap) - alloclen += rt->dst.trailer_len; + alloclen += alloc_extra; if (transhdrlen) { - skb = sock_alloc_send_skb(sk, - alloclen + hh_len + 15, + skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = alloc_skb(alloclen + hh_len + 15, + skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ec6036713e2c..b297bb28556e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -663,12 +663,11 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex, struct sockaddr_storage *group, struct sockaddr_storage *list) { - int msize = IP_MSFILTER_SIZE(numsrc); struct ip_msfilter *msf; struct sockaddr_in *psin; int err, i; - msf = kmalloc(msize, GFP_KERNEL); + msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL); if (!msf) return -ENOBUFS; @@ -684,7 +683,7 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex, if (psin->sin_family != AF_INET) goto Eaddrnotavail; - msf->imsf_slist[i] = psin->sin_addr.s_addr; + msf->imsf_slist_flex[i] = psin->sin_addr.s_addr; } err = ip_mc_msfilter(sk, msf, ifindex); kfree(msf); @@ -791,7 +790,8 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) goto out_free_gsf; err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, - gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist); + gsf->gf_fmode, &gsf->gf_group, + gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return err; @@ -800,7 +800,7 @@ out_free_gsf: static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; unsigned int n; void *p; @@ -814,7 +814,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ err = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) @@ -827,7 +827,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, goto out_free_gsf; err = -EINVAL; - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ @@ -835,7 +835,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, - &gf32->gf_group, gf32->gf_slist); + &gf32->gf_group, gf32->gf_slist_flex); out_free_gsf: kfree(p); return err; @@ -1456,7 +1456,7 @@ static bool getsockopt_needs_rtnl(int optname) static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct group_filter, gf_slist); + const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter __user *p = optval; struct group_filter gsf; int num; @@ -1468,7 +1468,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, return -EFAULT; num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex); if (err) return err; if (gsf.gf_numsrc < num) @@ -1482,7 +1482,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; @@ -1499,7 +1499,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; - err = ip_mc_gsfget(sk, &gf, p->gf_slist); + err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex); if (err) return err; if (gf.gf_numsrc < num) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index f6cc26de5ed3..fe9101d3d69e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -317,7 +317,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) } dev->needed_headroom = t_hlen + hlen; - mtu -= t_hlen; + mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0); if (mtu < IPV4_MIN_MTU) mtu = IPV4_MIN_MTU; @@ -348,6 +348,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->max_mtu -= dev->hard_header_len; + ip_tunnel_add(itn, nt); return nt; @@ -387,7 +390,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } - skb_reset_network_header(skb); + skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { @@ -489,11 +492,14 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; pkt_size = skb->len - tunnel_hlen; + pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; - if (df) + if (df) { mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); - else + mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; + } else { mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; + } if (skb_valid_dst(skb)) skb_dst_update_pmtu_no_confirm(skb, mtu); @@ -952,19 +958,20 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); -int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (!err && copy_to_user(data, &p, sizeof(p))) return -EFAULT; return err; } -EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); +EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { @@ -972,6 +979,9 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) int t_hlen = tunnel->hlen + sizeof(struct iphdr); int max_mtu = IP_MAX_MTU - t_hlen; + if (dev->type == ARPHRD_ETHER) + max_mtu -= dev->hard_header_len; + if (new_mtu < ETH_MIN_MTU) return -EINVAL; @@ -1149,6 +1159,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], if (tb[IFLA_MTU]) { unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); + if (dev->type == ARPHRD_ETHER) + max -= dev->hard_header_len; + mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 31c6c6d99d5e..efe25a0172e6 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -351,6 +351,7 @@ static int vti4_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; + break; case ICMP_REDIRECT: break; default: @@ -404,7 +405,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index b42683212c65..366094c1ce6c 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -31,6 +31,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; + break; case ICMP_REDIRECT: break; default: @@ -152,7 +153,6 @@ static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ipcomp_type = { - .description = "IPCOMP4", .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp4_init_state, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index d5bfa087c23a..3aa78ccbec3e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -242,6 +242,8 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) if (!tun_dst) return 0; } + skb_reset_mac_header(skb); + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } @@ -345,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 939792a38814..2dda856ca260 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1317,7 +1317,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) } /* called from ip_ra_control(), before an RCU grace period, - * we dont need to call synchronize_rcu() here + * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { @@ -1938,7 +1938,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; - /* For an (*,G) entry, we only check that the incomming + /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); @@ -2119,7 +2119,7 @@ int ip_mr_input(struct sk_buff *skb) raw_rcv(mroute_sk, skb); return 0; } - } + } } /* already under rcu_read_lock() */ @@ -3007,7 +3007,6 @@ static const struct seq_operations ipmr_mfc_seq_ops = { #ifdef CONFIG_IP_PIMSM_V2 static const struct net_protocol pim_protocol = { .handler = pim_rcv, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6922612df456..3de78416ec76 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -18,15 +18,12 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) -static int __net_init arptable_filter_table_init(struct net *net); - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, .priority = NF_IP_PRI_FILTER, - .table_init = arptable_filter_table_init, }; /* The work comes in here from netfilter.c */ @@ -39,7 +36,7 @@ arptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_table_init(struct net *net) +static int arptable_filter_table_init(struct net *net) { struct arpt_replace *repl; int err; @@ -69,30 +66,32 @@ static struct pernet_operations arptable_filter_net_ops = { static int __init arptable_filter_init(void) { - int ret; + int ret = xt_register_template(&packet_filter, + arptable_filter_table_init); + + if (ret < 0) + return ret; arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); - if (IS_ERR(arpfilter_ops)) + if (IS_ERR(arpfilter_ops)) { + xt_unregister_template(&packet_filter); return PTR_ERR(arpfilter_ops); + } ret = register_pernet_subsys(&arptable_filter_net_ops); if (ret < 0) { + xt_unregister_template(&packet_filter); kfree(arpfilter_ops); return ret; } - ret = arptable_filter_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&arptable_filter_net_ops); - kfree(arpfilter_ops); - } - return ret; } static void __exit arptable_filter_fini(void) { unregister_pernet_subsys(&arptable_filter_net_ops); + xt_unregister_template(&packet_filter); kfree(arpfilter_ops); } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 8f7ca67475b7..8fd1aba8af31 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -66,11 +66,22 @@ struct clusterip_net { /* lock protects the configs list */ spinlock_t lock; + bool clusterip_deprecated_warning; #ifdef CONFIG_PROC_FS struct proc_dir_entry *procdir; /* mutex protects the config->pde*/ struct mutex mutex; #endif + unsigned int hook_users; +}; + +static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); + +static const struct nf_hook_ops cip_arp_ops = { + .hook = clusterip_arp_mangle, + .pf = NFPROTO_ARP, + .hooknum = NF_ARP_OUT, + .priority = -1 }; static unsigned int clusterip_net_id __read_mostly; @@ -458,6 +469,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) static int clusterip_tg_check(const struct xt_tgchk_param *par) { struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + struct clusterip_net *cn = clusterip_pernet(par->net); const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; int ret, i; @@ -467,6 +479,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return -EOPNOTSUPP; } + if (cn->hook_users == UINT_MAX) + return -EOVERFLOW; + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { @@ -517,10 +532,23 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return ret; } - if (!par->net->xt.clusterip_deprecated_warning) { + if (cn->hook_users == 0) { + ret = nf_register_net_hook(par->net, &cip_arp_ops); + + if (ret < 0) { + clusterip_config_entry_put(config); + clusterip_config_put(config); + nf_ct_netns_put(par->net, par->family); + return ret; + } + } + + cn->hook_users++; + + if (!cn->clusterip_deprecated_warning) { pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " "use xt_cluster instead\n"); - par->net->xt.clusterip_deprecated_warning = true; + cn->clusterip_deprecated_warning = true; } cipinfo->config = config; @@ -531,6 +559,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + struct clusterip_net *cn = clusterip_pernet(par->net); /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ @@ -539,6 +568,10 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) clusterip_config_put(cipinfo->config); nf_ct_netns_put(par->net, par->family); + cn->hook_users--; + + if (cn->hook_users == 0) + nf_unregister_net_hook(par->net, &cip_arp_ops); } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -602,9 +635,8 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) +clusterip_arp_mangle(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; @@ -654,13 +686,6 @@ arp_mangle(void *priv, return NF_ACCEPT; } -static const struct nf_hook_ops cip_arp_ops = { - .hook = arp_mangle, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = -1 -}; - /*********************************************************************** * PROC DIR HANDLING ***********************************************************************/ @@ -817,20 +842,14 @@ static const struct proc_ops clusterip_proc_ops = { static int clusterip_net_init(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); - int ret; INIT_LIST_HEAD(&cn->configs); spin_lock_init(&cn->lock); - ret = nf_register_net_hook(net, &cip_arp_ops); - if (ret < 0) - return ret; - #ifdef CONFIG_PROC_FS cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); if (!cn->procdir) { - nf_unregister_net_hook(net, &cip_arp_ops); pr_err("Unable to proc dir entry\n"); return -ENOMEM; } @@ -850,7 +869,6 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; mutex_unlock(&cn->mutex); #endif - nf_unregister_net_hook(net, &cip_arp_ops); } static struct pernet_operations clusterip_net_ops = { diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 8272df7c6ad5..0eb0e2ab9bfc 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -19,7 +19,6 @@ MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static int __net_init iptable_filter_table_init(struct net *net); static const struct xt_table packet_filter = { .name = "filter", @@ -27,7 +26,6 @@ static const struct xt_table packet_filter = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, - .table_init = iptable_filter_table_init, }; static unsigned int @@ -43,7 +41,7 @@ static struct nf_hook_ops *filter_ops __read_mostly; static bool forward __read_mostly = true; module_param(forward, bool, 0000); -static int __net_init iptable_filter_table_init(struct net *net) +static int iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; int err; @@ -62,7 +60,7 @@ static int __net_init iptable_filter_table_init(struct net *net) static int __net_init iptable_filter_net_init(struct net *net) { - if (net == &init_net || !forward) + if (!forward) return iptable_filter_table_init(net); return 0; @@ -86,22 +84,32 @@ static struct pernet_operations iptable_filter_net_ops = { static int __init iptable_filter_init(void) { - int ret; + int ret = xt_register_template(&packet_filter, + iptable_filter_table_init); + + if (ret < 0) + return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); - if (IS_ERR(filter_ops)) + if (IS_ERR(filter_ops)) { + xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); + } ret = register_pernet_subsys(&iptable_filter_net_ops); - if (ret < 0) + if (ret < 0) { + xt_unregister_template(&packet_filter); kfree(filter_ops); + return ret; + } - return ret; + return 0; } static void __exit iptable_filter_fini(void) { unregister_pernet_subsys(&iptable_filter_net_ops); + xt_unregister_template(&packet_filter); kfree(filter_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 2abc3836f391..40417a3f930b 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -25,15 +25,12 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -static int __net_init iptable_mangle_table_init(struct net *net); - static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, - .table_init = iptable_mangle_table_init, }; static unsigned int @@ -83,7 +80,7 @@ iptable_mangle_hook(void *priv, } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init iptable_mangle_table_init(struct net *net) +static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; @@ -113,32 +110,32 @@ static struct pernet_operations iptable_mangle_net_ops = { static int __init iptable_mangle_init(void) { - int ret; + int ret = xt_register_template(&packet_mangler, + iptable_mangle_table_init); + if (ret < 0) + return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { + xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { + xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } - ret = iptable_mangle_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&iptable_mangle_net_ops); - kfree(mangle_ops); - } - return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); + xt_unregister_template(&packet_mangler); kfree(mangle_ops); } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a9913842ef18..45d7e072e6a5 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -17,8 +17,6 @@ struct iptable_nat_pernet { struct nf_hook_ops *nf_nat_ops; }; -static int __net_init iptable_nat_table_init(struct net *net); - static unsigned int iptable_nat_net_id __read_mostly; static const struct xt_table nf_nat_ipv4_table = { @@ -29,7 +27,6 @@ static const struct xt_table nf_nat_ipv4_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, - .table_init = iptable_nat_table_init, }; static unsigned int iptable_nat_do_chain(void *priv, @@ -113,7 +110,7 @@ static void ipt_nat_unregister_lookups(struct net *net) kfree(ops); } -static int __net_init iptable_nat_table_init(struct net *net) +static int iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; int ret; @@ -155,20 +152,25 @@ static struct pernet_operations iptable_nat_net_ops = { static int __init iptable_nat_init(void) { - int ret = register_pernet_subsys(&iptable_nat_net_ops); + int ret = xt_register_template(&nf_nat_ipv4_table, + iptable_nat_table_init); + + if (ret < 0) + return ret; - if (ret) + ret = register_pernet_subsys(&iptable_nat_net_ops); + if (ret < 0) { + xt_unregister_template(&nf_nat_ipv4_table); return ret; + } - ret = iptable_nat_table_init(&init_net); - if (ret) - unregister_pernet_subsys(&iptable_nat_net_ops); return ret; } static void __exit iptable_nat_exit(void) { unregister_pernet_subsys(&iptable_nat_net_ops); + xt_unregister_template(&nf_nat_ipv4_table); } module_init(iptable_nat_init); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index ceef397c1f5f..b88e0f36cd05 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -12,8 +12,6 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static int __net_init iptable_raw_table_init(struct net *net); - static bool raw_before_defrag __read_mostly; MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); module_param(raw_before_defrag, bool, 0000); @@ -24,7 +22,6 @@ static const struct xt_table packet_raw = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW, - .table_init = iptable_raw_table_init, }; static const struct xt_table packet_raw_before_defrag = { @@ -33,7 +30,6 @@ static const struct xt_table packet_raw_before_defrag = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG, - .table_init = iptable_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -89,22 +85,24 @@ static int __init iptable_raw_init(void) pr_info("Enabling raw table before defrag\n"); } + ret = xt_register_template(table, + iptable_raw_table_init); + if (ret < 0) + return ret; + rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook); - if (IS_ERR(rawtable_ops)) + if (IS_ERR(rawtable_ops)) { + xt_unregister_template(table); return PTR_ERR(rawtable_ops); + } ret = register_pernet_subsys(&iptable_raw_net_ops); if (ret < 0) { + xt_unregister_template(table); kfree(rawtable_ops); return ret; } - ret = iptable_raw_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&iptable_raw_net_ops); - kfree(rawtable_ops); - } - return ret; } @@ -112,6 +110,7 @@ static void __exit iptable_raw_fini(void) { unregister_pernet_subsys(&iptable_raw_net_ops); kfree(rawtable_ops); + xt_unregister_template(&packet_raw); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 77973f5fd8f6..f519162a2fa5 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -25,15 +25,12 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static int __net_init iptable_security_table_init(struct net *net); - static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_SECURITY, - .table_init = iptable_security_table_init, }; static unsigned int @@ -45,7 +42,7 @@ iptable_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_table_init(struct net *net) +static int iptable_security_table_init(struct net *net) { struct ipt_replace *repl; int ret; @@ -75,24 +72,25 @@ static struct pernet_operations iptable_security_net_ops = { static int __init iptable_security_init(void) { - int ret; + int ret = xt_register_template(&security_table, + iptable_security_table_init); + + if (ret < 0) + return ret; sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); - if (IS_ERR(sectbl_ops)) + if (IS_ERR(sectbl_ops)) { + xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); + } ret = register_pernet_subsys(&iptable_security_net_ops); if (ret < 0) { + xt_unregister_template(&security_table); kfree(sectbl_ops); return ret; } - ret = iptable_security_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&iptable_security_net_ops); - kfree(sectbl_ops); - } - return ret; } @@ -100,6 +98,7 @@ static void __exit iptable_security_fini(void) { unregister_pernet_subsys(&iptable_security_net_ops); kfree(sectbl_ops); + xt_unregister_template(&security_table); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ff437e4ed6db..55fc23a8f7a7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 4075230b14c6..75ca4b6e484f 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2490,6 +2490,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; @@ -2528,6 +2529,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 95a718397fd1..1e44a43acfe2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -573,7 +573,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) } } sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: sock_put(sk); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6d46297a99f8..b0d3a09dc84e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -295,6 +295,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), + SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), + SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9a8c0892622b..6913979948d7 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -31,12 +31,6 @@ EXPORT_SYMBOL(inet_offloads); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - if (!prot->netns_ok) { - pr_err("Protocol %u is not namespace aware, cannot register.\n", - protocol); - return -EINVAL; - } - return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 50a73178d63a..bb446e60cf58 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -280,7 +280,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) if (inet->recverr || harderr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } } @@ -929,7 +929,7 @@ int raw_abort(struct sock *sk, int err) lock_sock(sk); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 1b5b8af27aaf..ccacbde30a2c 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -119,11 +119,8 @@ static int raw_diag_dump_one(struct netlink_callback *cb, return err; } - err = netlink_unicast(net->diag_nlsk, rep, - NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + return err; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a36ac98476f..d6899ab5fb39 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -276,12 +276,13 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } - seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " - " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, @@ -586,28 +587,35 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) } } -static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { - struct fib_nh_exception *fnhe, *oldest; + struct fib_nh_exception __rcu **fnhe_p, **oldest_p; + struct fib_nh_exception *fnhe, *oldest = NULL; - oldest = rcu_dereference(hash->chain); - for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; - fnhe = rcu_dereference(fnhe->fnhe_next)) { - if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { + fnhe = rcu_dereference_protected(*fnhe_p, + lockdep_is_held(&fnhe_lock)); + if (!fnhe) + break; + if (!oldest || + time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; + oldest_p = fnhe_p; + } } fnhe_flush_routes(oldest); - return oldest; + *oldest_p = oldest->fnhe_next; + kfree_rcu(oldest, rcu); } -static inline u32 fnhe_hashfun(__be32 daddr) +static u32 fnhe_hashfun(__be32 daddr) { - static u32 fnhe_hashrnd __read_mostly; - u32 hval; + static siphash_key_t fnhe_hash_key __read_mostly; + u64 hval; - net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); - hval = jhash_1word((__force u32)daddr, fnhe_hashrnd); - return hash_32(hval, FNHE_HASH_SHIFT); + net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); + hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); + return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) @@ -676,16 +684,21 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, if (rt) fill_route_from_fnhe(rt, fnhe); } else { - if (depth > FNHE_RECLAIM_DEPTH) - fnhe = fnhe_oldest(hash); - else { - fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); - if (!fnhe) - goto out_unlock; - - fnhe->fnhe_next = hash->chain; - rcu_assign_pointer(hash->chain, fnhe); + /* Randomize max depth to avoid some side channels attacks. */ + int max_depth = FNHE_RECLAIM_DEPTH + + prandom_u32_max(FNHE_RECLAIM_DEPTH); + + while (depth > max_depth) { + fnhe_remove_oldest(hash); + depth--; } + + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; @@ -693,6 +706,8 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); + rcu_assign_pointer(hash->chain, fnhe); + /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. @@ -1299,25 +1314,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; - unsigned int mtu = rt->rt_pmtu; - - if (!mtu || time_after_eq(jiffies, rt->dst.expires)) - mtu = dst_metric_raw(dst, RTAX_MTU); - - if (mtu) - return mtu; - - mtu = READ_ONCE(dst->dev->mtu); - - if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) - mtu = 576; - } - - mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); @@ -1906,13 +1903,128 @@ out: hash_keys->addrs.v4addrs.dst = key_iph->daddr; } +static u32 fib_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. + */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 fib_multipath_custom_hash_fl4(const struct net *net, + const struct flowi4 *fl4) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = fl4->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = fl4->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl4->flowi4_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl4->fl4_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (net->ipv4.sysctl_fib_multipath_hash_policy) { case 0: @@ -1924,6 +2036,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -1957,6 +2070,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -1987,9 +2101,15 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); + break; + case 3: + if (skb) + mhash = fib_multipath_custom_hash_skb(net, skb); + else + mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } - mhash = flow_hash_from_keys(&hash_keys); if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); @@ -2707,8 +2827,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->output = dst_discard_out; new->dev = net->loopback_dev; - if (new->dev) - dev_hold(new->dev); + dev_hold(new->dev); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; @@ -3046,7 +3165,7 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; - udph->len = sizeof(struct udphdr); + udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a62934b9f15a..6f1e64d49232 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -19,6 +19,7 @@ #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_fib.h> #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> @@ -29,6 +30,7 @@ #include <net/netevent.h> static int two = 2; +static int three __maybe_unused = 3; static int four = 4; static int thousand = 1000; static int tcp_retr1_max = 255; @@ -48,6 +50,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -461,6 +465,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, return ret; } + +static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); + + return ret; +} #endif static struct ctl_table ipv4_table[] = { @@ -941,6 +961,15 @@ static struct ctl_table ipv4_net_table[] = { }, #endif { + .procname = "tcp_migrate_req", + .data = &init_net.ipv4.sysctl_tcp_migrate_req, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -1050,7 +1079,16 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, + }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_fib_multipath_hash_fields, + .extra1 = SYSCTL_ONE, + .extra2 = &fib_multipath_hash_fields_all_mask, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1c1f9e3de72..e8b48df73c85 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1375,6 +1375,9 @@ new_segment: } pfrag->offset += copy; } else { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); @@ -1738,8 +1741,8 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); @@ -2024,8 +2027,6 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss); static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) @@ -2095,8 +2096,8 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_lock(current->mm); - vma = find_vma(current->mm, address); - if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { + vma = vma_lookup(current->mm, address); + if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(current->mm); return -EINVAL; } @@ -2197,8 +2198,8 @@ out: #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss) +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; @@ -3061,7 +3062,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.offset = 0; } - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); @@ -3337,6 +3338,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) } else { tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val; + tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); } return 0; } @@ -4450,7 +4452,7 @@ int tcp_abort(struct sock *sk, int err) sk->sk_err = err; /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); - sk->sk_error_report(sk); + sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); @@ -4511,7 +4513,9 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT, + NULL); /* Size and allocate the main established and bind bucket * hash tables. diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6ea3dc2e4219..6274462b86b4 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1041,7 +1041,7 @@ static void bbr_init(struct sock *sk) bbr->prior_cwnd = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; - bbr->next_rtt_delivered = 0; + bbr->next_rtt_delivered = tp->delivered; bbr->prev_ca_state = TCP_CA_Open; bbr->packet_conservation = 0; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ad9d17923fc5..d3e9386b493e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -163,6 +163,28 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } +static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -184,11 +206,11 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = tcp_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -196,14 +218,9 @@ msg_bytes_ready: sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; -out: release_sock(sk); sk_psock_put(sk, psock); return ret; @@ -486,7 +503,7 @@ static int __init tcp_bpf_v4_build_proto(void) tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } -core_initcall(tcp_bpf_v4_build_proto); +late_initcall(tcp_bpf_v4_build_proto); static int tcp_bpf_assert_proto_ops(struct proto *ops) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index af2814c9342a..59412d6354a0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -55,12 +55,7 @@ void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - - ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); @@ -89,18 +84,12 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, ctx->num = 1; } - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; - octx = rcu_dereference_protected(q->ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(q->ctx, ctx); + octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx); } else { - octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx); } - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); @@ -379,8 +368,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, return NULL; } - if (syn_data && - tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) + if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len == 0) { @@ -507,8 +495,18 @@ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); + if (!sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout) + return; + + /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ + WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies); + + /* Paired with smp_rmb() in tcp_fastopen_active_should_disable(). + * We want net->ipv4.tfo_active_disable_stamp to be updated first. + */ + smp_mb__before_atomic(); atomic_inc(&net->ipv4.tfo_active_disable_times); - net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } @@ -519,17 +517,27 @@ void tcp_fastopen_active_disable(struct sock *sk) bool tcp_fastopen_active_should_disable(struct sock *sk) { unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; - int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int tfo_da_times; int multiplier; + if (!tfo_bh_timeout) + return false; + + tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); if (!tfo_da_times) return false; - /* Limit timout to max: 2^6 * initial timeout */ + /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */ + smp_rmb(); + + /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * tfo_bh_timeout * HZ; - if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) + + /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */ + timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) + + multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, timeout)) return true; /* Mark check bit so we can check for successful active TFO diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf4dd532d1c..3f7bd7ae7d7a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ +#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -454,11 +455,12 @@ static void tcp_sndbuf_expand(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, + unsigned int skbtruesize) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { @@ -471,7 +473,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing + * can play nice with us, as sk_buff and skb->head might be either + * freed or shared with up to MAX_SKB_FRAGS segments. + * Only give a boost to drivers using page frag(s) to hold the frame(s), + * and if no payload was pulled in skb->head before reaching us. + */ +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) +{ + u32 truesize = skb->truesize; + + if (adjust && !skb_headlen(skb)) { + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); + /* paranoid check, some drivers might be buggy */ + if (unlikely((int)truesize < (int)skb->len)) + truesize = skb->truesize; + } + return truesize; +} + +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, + bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; @@ -480,15 +502,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (room > 0 && !tcp_under_memory_pressure(sk)) { + unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); @@ -782,7 +805,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(sk, skb); if (skb->len >= 128) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -969,6 +992,8 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, return 0; if (seq_len > tp->mss_cache) dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) + state->flag |= FLAG_DSACK_TLP; tp->dsack_dups += dup_segs; /* Skip the DSACK if dup segs weren't retransmitted by sender */ @@ -976,7 +1001,14 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; - tp->rack.dsack_seen = 1; + /* We increase the RACK ordering window in rounds where we receive + * DSACKs that may have been due to reordering causing RACK to trigger + * a spurious fast recovery. Thus RACK ignores DSACKs that happen + * without having seen reordering, or that match TLP probes (TLP + * is timer-driven, not triggered by RACK). + */ + if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) + tp->rack.dsack_seen = 1; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ @@ -2816,8 +2848,17 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, *rexmit = REXMIT_LOST; } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, + bool *do_lost) { struct tcp_sock *tp = tcp_sk(sk); @@ -2842,7 +2883,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - return true; + } else { + /* Partial ACK arrived. Force fast retransmit. */ + *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -2866,14 +2909,6 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) } } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2943,17 +2978,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); - } else { - if (tcp_try_undo_partial(sk, prior_snd_una)) - return; - /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_force_fast_retransmit(sk); - } - if (tcp_try_undo_dsack(sk)) { - tcp_try_keep_open(sk); + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) return; - } + + if (tcp_try_undo_dsack(sk)) + tcp_try_keep_open(sk); + tcp_identify_packet_loss(sk, ack_flag); + if (icsk->icsk_ca_state != TCP_CA_Recovery) { + if (!tcp_time_to_recover(sk, flag)) + return; + /* Undo reverts the recovery state. If loss is evident, + * starts a new recovery (e.g. reordering then loss); + */ + tcp_enter_recovery(sk, ece_ack); + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); @@ -3621,7 +3660,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (!tp->tlp_retrans) { /* TLP of new data has been acknowledged */ tp->tlp_high_seq = 0; - } else if (flag & FLAG_DSACKING_ACK) { + } else if (flag & FLAG_DSACK_TLP) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -4240,6 +4279,9 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + /* mptcp can't tell us to ignore reset pkts, + * so just ignore the return value of mptcp_incoming_options(). + */ if (sk_is_mptcp(sk)) mptcp_incoming_options(sk, skb); @@ -4263,7 +4305,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } /* @@ -4759,7 +4801,7 @@ coalesce_done: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4847,7 +4889,7 @@ end: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } @@ -4934,8 +4976,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; - if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb); + /* If a subflow has been reset, the packet should not continue + * to be processed, drop the packet. + */ + if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) { + __kfree_skb(skb); + return; + } if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@ -5368,7 +5415,7 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - sk->sk_write_space(sk); + INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } static void tcp_check_space(struct sock *sk) @@ -5885,6 +5932,7 @@ step5: return; csum_error: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -5914,8 +5962,8 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; - icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); + /* Initialize congestion control unless BPF initialized it already: */ if (!icsk->icsk_ca_initialized) tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); @@ -6515,8 +6563,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSING: case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb); + /* If a subflow has been reset, the packet should not + * continue to be processed, drop the packet. + */ + if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) + goto discard; break; } fallthrough; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 312184cead57..2e62e0d6373a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -342,7 +342,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; - mtu = tcp_sk(sk)->mtu_info; + mtu = READ_ONCE(tcp_sk(sk)->mtu_info); dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -546,7 +546,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) if (sk->sk_state == TCP_LISTEN) goto out; - tp->mtu_info = info; + WRITE_ONCE(tp->mtu_info, info); if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { @@ -585,7 +585,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) if (!sock_owned_by_user(sk)) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); tcp_done(sk); } else { @@ -613,7 +613,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } @@ -1731,6 +1731,7 @@ discard: return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1801,6 +1802,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); + trace_tcp_bad_csum(skb); __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; @@ -2000,13 +2002,21 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ + sock_hold(sk); } - /* We own a reference on the listener, increase it again - * as we might lose it too soon. - */ - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { @@ -2098,6 +2108,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); @@ -2266,51 +2277,72 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -/* - * Get next listener socket follow cur. If cur is NULL, get first socket - * starting from bucket given in st->bucket; when st->bucket is zero the - * very first socket in the hash table is returned. +static unsigned short seq_file_family(const struct seq_file *seq); + +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) +{ + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + +/* Find a non empty bucket (starting from st->bucket) + * and return the first sk from it. */ -static void *listening_get_next(struct seq_file *seq, void *cur) +static void *listening_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - struct sock *sk = cur; - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; + struct sock *sk; - if (!sk) { -get_head: - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock(&ilb->lock); - sk = sk_nulls_head(&ilb->nulls_head); - st->offset = 0; - goto get_sk; + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + if (hlist_empty(&ilb2->head)) + continue; + + spin_lock(&ilb2->lock); + inet_lhash2_for_each_icsk(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) + return sk; + } + spin_unlock(&ilb2->lock); } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; + + return NULL; +} + +/* Find the next sk of "cur" within the same bucket (i.e. st->bucket). + * If "cur" is the last one in the st->bucket, + * call listening_get_first() to return the first sk of the next + * non empty bucket. + */ +static void *listening_get_next(struct seq_file *seq, void *cur) +{ + struct tcp_iter_state *st = seq->private; + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; + struct sock *sk = cur; + ++st->num; ++st->offset; - sk = sk_nulls_next(sk); -get_sk: - sk_nulls_for_each_from(sk, node) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + icsk = inet_csk(sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) return sk; } - spin_unlock(&ilb->lock); - st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) - goto get_head; - return NULL; + + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + spin_unlock(&ilb2->lock); + ++st->bucket; + return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2320,7 +2352,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) st->bucket = 0; st->offset = 0; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2340,15 +2372,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - void *rc = NULL; - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { @@ -2362,32 +2386,20 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if ((afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family) || - !net_eq(sock_net(sk), net)) { - continue; - } - rc = sk; - goto out; + if (seq_sk_match(seq, sk)) + return sk; } spin_unlock_bh(lock); } -out: - return rc; + + return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); ++st->num; ++st->offset; @@ -2395,9 +2407,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if ((afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) && - net_eq(sock_net(sk), net)) + if (seq_sk_match(seq, sk)) return sk; } @@ -2440,17 +2450,18 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: - if (st->bucket >= INET_LHTABLE_SIZE) + if (st->bucket > tcp_hashinfo.lhash2_mask) break; st->state = TCP_SEQ_STATE_LISTENING; - rc = listening_get_next(seq, NULL); - while (offset-- && rc) + rc = listening_get_first(seq); + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2461,7 +2472,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } @@ -2531,7 +2542,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) @@ -2676,6 +2687,15 @@ out: } #ifdef CONFIG_BPF_SYSCALL +struct bpf_tcp_iter_state { + struct tcp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); @@ -2694,16 +2714,204 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_tcp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct inet_connection_sock *icsk; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + icsk = inet_csk(start_sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + + return expected; +} + +static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct hlist_nulls_node *node; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + + return expected; +} + +static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + unsigned int expected; + bool resized = false; + struct sock *sk; + + /* The st->bucket is done. Directly advance to the next + * bucket instead of having the tcp_seek_last_pos() to skip + * one by one in the current bucket and eventually find out + * it has to advance to the next bucket. + */ + if (iter->st_bucket_done) { + st->offset = 0; + st->bucket++; + if (st->state == TCP_SEQ_STATE_LISTENING && + st->bucket > tcp_hashinfo.lhash2_mask) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + } + } + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + + sk = tcp_seek_last_pos(seq); + if (!sk) + return NULL; /* Done */ + + if (st->state == TCP_SEQ_STATE_LISTENING) + expected = bpf_iter_tcp_listening_batch(seq, sk); + else + expected = bpf_iter_tcp_established_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + if (*pos) + return bpf_iter_tcp_batch(seq); + + return SEQ_START_TOKEN; +} + +static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) { + /* Keeping st->num consistent in tcp_iter_state. + * bpf_iter_tcp does not use st->num. + * meta.seq_num is used instead. + */ + st->num++; + /* Move st->offset to the next sk in the bucket such that + * the future start() will resume at st->offset in + * st->bucket. See tcp_seek_last_pos(). + */ + st->offset++; + sock_put(iter->batch[iter->cur_sk++]); + } + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_tcp_batch(seq); + + ++*pos; + /* Keeping st->last_pos consistent in tcp_iter_state. + * bpf iter does not do lseek, so st->last_pos always equals to *pos. + */ + st->last_pos = *pos; + return sk; +} + static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; + bool slow; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + if (sk_fullsock(sk)) + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { @@ -2717,11 +2925,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return tcp_prog_seq_show(prog, &meta, v, uid); + ret = tcp_prog_seq_show(prog, &meta, v, uid); + +unlock: + if (sk_fullsock(sk)) + unlock_sock_fast(sk, slow); + return ret; + } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -2732,16 +2947,33 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) (void)tcp_prog_seq_show(prog, &meta, v, 0); } - tcp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_tcp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, - .start = tcp_seq_start, - .next = tcp_seq_next, + .start = bpf_iter_tcp_seq_start, + .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct tcp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ + if (seq->op == &bpf_iter_tcp_seq_ops) + return AF_UNSPEC; +#endif + + /* Iterated from proc fs */ + afinfo = PDE_DATA(file_inode(seq->file)); + return afinfo->family; +} static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, @@ -2953,8 +3185,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; - spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); - net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); /* Reno is always built in */ @@ -2992,39 +3223,55 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) +#define INIT_BATCH_SZ 16 + static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - struct tcp_iter_state *st = priv_data; - struct tcp_seq_afinfo *afinfo; - int ret; + struct bpf_tcp_iter_state *iter = priv_data; + int err; - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) - return -ENOMEM; + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; - afinfo->family = AF_UNSPEC; - st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data, aux); - if (ret) - kfree(afinfo); - return ret; + err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; } static void bpf_iter_fini_tcp(void *priv_data) { - struct tcp_iter_state *st = priv_data; + struct bpf_tcp_iter_state *iter = priv_data; - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, - .seq_priv_size = sizeof(struct tcp_iter_state), + .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, @@ -3032,6 +3279,7 @@ static struct bpf_iter_reg tcp_reg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7513ba45553d..0a4f3f16140a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, goto listen_overflow; if (own_req && rsk_drop_req(req)) { - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_drop_and_put(sk, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } @@ -786,6 +786,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index e09147ac9a99..fc61cd3fea65 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -298,6 +298,9 @@ int tcp_gro_complete(struct sk_buff *skb) if (th->cwr) skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + if (skb->encapsulation) + skb->inner_transport_header = skb->transport_header; + return 0; } EXPORT_SYMBOL(tcp_gro_complete); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bde781f46b41..6d72f3ea48c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1732,6 +1732,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } +EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -3372,7 +3373,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_charge_skmem(sk->sk_memcg, amt); + mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 6f1b4ac7fe99..fd113f6226ef 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -172,7 +172,8 @@ void tcp_rack_reo_timeout(struct sock *sk) /* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. * - * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * If a DSACK is received that seems like it may have been due to reordering + * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded * by srtt), since there is possibility that spurious retransmission was * due to reordering delay longer than reo_wnd. * diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4ef08079ccfa..20cf4a98c69d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -68,7 +68,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); tcp_write_queue_purge(sk); tcp_done(sk); @@ -441,7 +441,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * This function gets called when the kernel timer for a TCP packet * of this socket expires. * - * It handles retransmission, timer adjustment and other necesarry measures. + * It handles retransmission, timer adjustment and other necessary measures. * * Returns: Nothing (void) */ @@ -766,7 +766,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) if (!sock_owned_by_user(sk)) { if (tp->compressed_ack) { /* Since we have to send one ack finally, - * substract one from tp->compressed_ack to keep + * subtract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. */ tp->compressed_ack--; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 3bb448761ca3..07c4c93b9fdb 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -221,7 +221,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = { static int __init tcp_yeah_register(void) { - BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_yeah); return 0; } diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index e44aaf41a138..5048c47c79b2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -218,7 +218,6 @@ static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, - .netns_ok = 1, }; #if IS_ENABLED(CONFIG_IPV6) @@ -226,7 +225,6 @@ static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, - .netns_ok = 1, }; #endif @@ -235,7 +233,6 @@ static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307ad0d3b9e..8851c9463b4b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -645,10 +645,12 @@ static struct sock *__udp4_lib_err_encap(struct net *net, const struct iphdr *iph, struct udphdr *uh, struct udp_table *udptable, + struct sock *sk, struct sk_buff *skb, u32 info) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; - struct sock *sk; + struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); @@ -659,18 +661,28 @@ static struct sock *__udp4_lib_err_encap(struct net *net, /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, iph->ihl << 2); + if (sk) { + up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (lookup && lookup(sk, skb)) + sk = NULL; + + goto out; + } + sk = __udp4_lib_lookup(net, iph->daddr, uh->source, iph->saddr, uh->dest, skb->dev->ifindex, 0, udptable, NULL); if (sk) { - int (*lookup)(struct sock *sk, struct sk_buff *skb); - struct udp_sock *up = udp_sk(sk); + up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } +out: if (!sk) sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); @@ -707,15 +719,16 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); + if (!sk || udp_sk(sk)->encap_type) { /* No socket for error: try tunnels before discarding */ - sk = ERR_PTR(-ENOENT); if (static_branch_unlikely(&udp_encap_needed_key)) { - sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb, + sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, info); if (!sk) return 0; - } + } else + sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); @@ -776,7 +789,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: return 0; } @@ -1102,7 +1115,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipcm_init_sk(&ipc, inet); - ipc.gso_size = up->gso_size; + ipc.gso_size = READ_ONCE(up->gso_size); if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); @@ -1130,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1798,11 +1811,13 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, if (used <= 0) { if (!copied) copied = used; + kfree_skb(skb); break; } else if (used <= skb->len) { copied += used; } + kfree_skb(skb); if (!desc->count) break; } @@ -2693,7 +2708,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_SEGMENT: if (val < 0 || val > USHRT_MAX) return -EINVAL; - up->gso_size = val; + WRITE_ONCE(up->gso_size, val); break; case UDP_GRO: @@ -2788,7 +2803,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, break; case UDP_SEGMENT: - val = up->gso_size; + val = READ_ONCE(up->gso_size); break; case UDP_GRO: @@ -2867,7 +2882,7 @@ int udp_abort(struct sock *sk, int err) goto out; sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); __udp_disconnect(sk, 0); out: diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 954c4591a6fd..7a1d5f473878 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -21,6 +21,45 @@ static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); } +static bool udp_sk_has_data(struct sock *sk) +{ + return !skb_queue_empty(&udp_sk(sk)->reader_queue) || + !skb_queue_empty(&sk->sk_receive_queue); +} + +static bool psock_has_data(struct sk_psock *psock) +{ + return !skb_queue_empty(&psock->ingress_skb) || + !sk_psock_queue_empty(psock); +} + +#define udp_msg_has_data(__sk, __psock) \ + ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) + +static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = udp_msg_has_data(sk, psock); + if (!ret) { + wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + ret = udp_msg_has_data(sk, psock); + } + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -34,8 +73,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); - if (sk_psock_queue_empty(psock)) { + if (!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } @@ -43,26 +81,21 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = udp_msg_wait_data(sk, psock, timeo); if (data) { - if (!sk_psock_queue_empty(psock)) + if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; out: - release_sock(sk); sk_psock_put(sk, psock); return ret; } @@ -79,7 +112,6 @@ static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; - prot->unhash = sock_map_unhash; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; } @@ -101,7 +133,7 @@ static int __init udp_bpf_v4_build_proto(void) udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot); return 0; } -core_initcall(udp_bpf_v4_build_proto); +late_initcall(udp_bpf_v4_build_proto); int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index b2cee9a307d4..1ed8c4d78e5c 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -77,10 +77,8 @@ static int udp_dump_one(struct udp_table *tbl, kfree_skb(rep); goto out; } - err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + out: if (sk) sock_put(sk); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 54e06b88af69..86d32a1e62ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -152,8 +152,8 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) { + const struct net_offload __rcu **offloads; __be16 protocol = skb->protocol; - const struct net_offload **offloads; const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, @@ -525,8 +525,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) - pp = call_gro_receive(udp_gro_receive_segment, head, skb); - return pp; + return call_gro_receive(udp_gro_receive_segment, head, skb); + + /* no GRO, be sure flush the current packet */ + goto out; } if (NAPI_GRO_CB(skb)->encap_mark || @@ -622,6 +624,10 @@ static int udp_gro_complete_segment(struct sk_buff *skb) skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; + + if (skb->encapsulation) + skb->inner_transport_header = skb->transport_header; + return 0; } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index bd8773b49e72..cd1cd68adeec 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -31,7 +31,6 @@ static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, - .netns_ok = 1, }; struct proto udplite_prot = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index ea595c8549c7..2fe5860c21d6 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -181,21 +181,18 @@ static const struct net_protocol esp4_protocol = { .handler = xfrm4_esp_rcv, .err_handler = xfrm4_esp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ah4_protocol = { .handler = xfrm4_ah_rcv, .err_handler = xfrm4_ah_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_ipcomp_rcv, .err_handler = xfrm4_ipcomp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct xfrm_input_afinfo xfrm4_input_afinfo = { diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index fb0648e7fb32..f4555a88f86b 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -42,7 +42,6 @@ static void ipip_destroy(struct xfrm_state *x) } static const struct xfrm_type ipip_type = { - .description = "IPIP", .owner = THIS_MODULE, .proto = IPPROTO_IPIP, .init_state = ipip_init_state, diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 747f56e0c636..e504204bca92 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -328,4 +328,15 @@ config IPV6_RPL_LWTUNNEL If unsure, say N. +config IPV6_IOAM6_LWTUNNEL + bool "IPv6: IOAM Pre-allocated Trace insertion support" + depends on IPV6 + select LWTUNNEL + help + Support for the inline insertion of IOAM Pre-allocated + Trace Header (only on locally generated packets), using + the lightweight tunnels mechanism. + + If unsure, say N. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index cf7b47bdb9b3..1bc7e143217b 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o fib6_notifier.o rpl.o + udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o @@ -27,6 +27,7 @@ ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o +ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 701eb82acd1c..c6a90b7bbb70 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -89,6 +89,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> +#include <linux/ioam6.h> #define INFINITY_LIFE_TIME 0xFFFFFFFF @@ -237,6 +238,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -293,6 +297,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -387,6 +394,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; ndev->cnf.mtu6 = dev->mtu; + ndev->ra_mtu = 0; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (!ndev->nd_parms) { kfree(ndev); @@ -694,8 +702,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, errout: if (in6_dev) in6_dev_put(in6_dev); - if (dev) - dev_put(dev); + dev_put(dev); return err; } @@ -1080,7 +1087,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - ifa = kzalloc(sizeof(*ifa), gfp_flags); + ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT); if (!ifa) { err = -ENOBUFS; goto out; @@ -3085,19 +3092,22 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, } } -#if IS_ENABLED(CONFIG_IPV6_SIT) -static void sit_add_v4_addrs(struct inet6_dev *idev) +#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +static void add_v4_addrs(struct inet6_dev *idev) { struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen; + int scope, plen, offset = 0; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ + if (idev->dev->addr_len == sizeof(struct in6_addr)) + offset = sizeof(struct in6_addr) - 4; + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); if (idev->dev->flags&IFF_POINTOPOINT) { addr.s6_addr32[0] = htonl(0xfe800000); @@ -3335,8 +3345,6 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && - (dev->type != ARPHRD_IP6GRE) && - (dev->type != ARPHRD_IPGRE) && (dev->type != ARPHRD_TUNNEL) && (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { @@ -3384,14 +3392,14 @@ static void addrconf_sit_config(struct net_device *dev) return; } - sit_add_v4_addrs(idev); + add_v4_addrs(idev); if (dev->flags&IFF_POINTOPOINT) addrconf_add_mroute(dev); } #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; @@ -3404,7 +3412,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - addrconf_addr_gen(idev, true); + if (dev->type == ARPHRD_ETHER) { + addrconf_addr_gen(idev, true); + return; + } + + add_v4_addrs(idev); + if (dev->flags & IFF_POINTOPOINT) addrconf_add_mroute(dev); } @@ -3580,7 +3594,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_sit_config(dev); break; #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: case ARPHRD_IPGRE: addrconf_gre_config(dev); break; @@ -3843,6 +3858,7 @@ restart: } idev->tstamp = jiffies; + idev->ra_mtu = 0; /* Last: Shot the device (if unregistered) */ if (unregister) { @@ -5211,8 +5227,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .netnsid = -1, .type = type, }; - struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; + struct net *tgt_net = sock_net(skb->sk); int idx, s_idx, s_ip_idx; int h, s_h; struct net_device *dev; @@ -5351,7 +5366,7 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct net *net = sock_net(in_skb->sk); + struct net *tgt_net = sock_net(in_skb->sk); struct inet6_fill_args fillargs = { .portid = NETLINK_CB(in_skb).portid, .seq = nlh->nlmsg_seq, @@ -5359,7 +5374,6 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .flags = 0, .netnsid = -1, }; - struct net *tgt_net = net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *addr = NULL, *peer; @@ -5412,8 +5426,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, errout_ifa: in6_ifa_put(ifa); errout: - if (dev) - dev_put(dev); + dev_put(dev); if (fillargs.netnsid >= 0) put_net(tgt_net); @@ -5526,6 +5539,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; + array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled; + array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; + array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; } static inline size_t inet6_ifla6_size(void) @@ -5537,6 +5553,7 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + nla_total_size(4) /* IFLA_INET6_RA_MTU */ + 0; } @@ -5645,6 +5662,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) goto nla_put_failure; + if (idev->ra_mtu && + nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -5761,6 +5782,9 @@ update_lft: static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = { [IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 }, [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) }, + [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT, + .reject_message = + "IFLA_INET6_RA_MTU can not be set" }, }; static int check_addr_gen_mode(int mode) @@ -5784,7 +5808,8 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net, } static int inet6_validate_link_af(const struct net_device *dev, - const struct nlattr *nla) + const struct nlattr *nla, + struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_INET6_MAX + 1]; struct inet6_dev *idev = NULL; @@ -5797,7 +5822,7 @@ static int inet6_validate_link_af(const struct net_device *dev, } err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, - inet6_af_policy, NULL); + inet6_af_policy, extack); if (err) return err; @@ -6540,6 +6565,7 @@ static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, static int minus_one = -1; static const int two_five_five = 255; +static u32 ioam6_if_id_max = U16_MAX; static const struct ctl_table addrconf_sysctl[] = { { @@ -6903,10 +6929,10 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { - .procname = "addr_gen_mode", - .data = &ipv6_devconf.addr_gen_mode, - .maxlen = sizeof(int), - .mode = 0644, + .procname = "addr_gen_mode", + .data = &ipv6_devconf.addr_gen_mode, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = addrconf_sysctl_addr_gen_mode, }, { @@ -6933,6 +6959,31 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "ioam6_enabled", + .data = &ipv6_devconf.ioam6_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, + { + .procname = "ioam6_id", + .data = &ipv6_devconf.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)&ioam6_if_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &ipv6_devconf.ioam6_id_wide, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2389ff702f51..b5878bb8e419 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -62,6 +62,7 @@ #include <net/rpl.h> #include <net/compat.h> #include <net/xfrm.h> +#include <net/ioam6.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -454,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -531,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -540,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, @@ -961,6 +962,9 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); + net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID; + net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE; + err = ipv6_init_mibs(net); if (err) return err; @@ -1191,6 +1195,10 @@ static int __init inet6_init(void) if (err) goto rpl_fail; + err = ioam6_init(); + if (err) + goto ioam6_fail; + err = igmp6_late_init(); if (err) goto igmp6_late_err; @@ -1213,6 +1221,8 @@ sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: + ioam6_exit(); +ioam6_fail: rpl_exit(); rpl_fail: seg6_exit(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 20d492da725a..828e62514260 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -755,7 +755,6 @@ static int ah6_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ah6_type = { - .description = "AH6", .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, @@ -763,7 +762,6 @@ static const struct xfrm_type ah6_type = { .destructor = ah6_destroy, .input = ah6_input, .output = ah6_output, - .hdr_offset = xfrm6_find_1stfragopt, }; static struct xfrm6_protocol ah6_protocol = { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 393ae2b78e7d..ed2f061b8768 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -708,7 +708,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -1243,7 +1243,6 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type esp6_type = { - .description = "ESP6", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, @@ -1251,7 +1250,6 @@ static const struct xfrm_type esp6_type = { .destructor = esp6_destroy, .input = esp6_input, .output = esp6_output, - .hdr_offset = xfrm6_find_1stfragopt, }; static struct xfrm6_protocol esp6_protocol = { diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 40ed4fcf1cf4..a349d4798077 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -377,7 +377,6 @@ static const struct net_offload esp6_offload = { }; static const struct xfrm_type_offload esp6_type_offload = { - .description = "ESP6 OFFLOAD", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp6_input_tail, diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 56e479d158b7..3a871a09f962 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -49,22 +49,12 @@ #include <net/seg6_hmac.h> #endif #include <net/rpl.h> +#include <linux/ioam6.h> +#include <net/ioam6.h> +#include <net/dst_metadata.h> #include <linux/uaccess.h> -/* - * Parsing tlv encoded headers. - * - * Parsing function "func" returns true, if parsing succeed - * and false, if it failed. - * It MUST NOT touch skb->h. - */ - -struct tlvtype_proc { - int type; - bool (*func)(struct sk_buff *skb, int offset); -}; - /********************* Generic functions *********************/ @@ -109,16 +99,23 @@ drop: return false; } +static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); +static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); +static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); +static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); +#if IS_ENABLED(CONFIG_IPV6_MIP6) +static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); +#endif + /* Parse tlv encoded option header (hop-by-hop or destination) */ -static bool ip6_parse_tlv(const struct tlvtype_proc *procs, +static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); - const struct tlvtype_proc *curr; bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; @@ -135,18 +132,23 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, len -= 2; while (len > 0) { - int optlen = nh[off + 1] + 2; - int i; + int optlen, i; - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; + if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; - break; + off++; + len--; + continue; + } + if (len < 2) + goto bad; + optlen = nh[off + 1] + 2; + if (optlen > len) + goto bad; - case IPV6_TLV_PADN: + if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. @@ -163,32 +165,51 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, if (nh[off + i] != 0) goto bad; } - break; - - default: /* Other TLV code so scan list */ - if (optlen > len) - goto bad; - + } else { tlv_count++; if (tlv_count > max_count) goto bad; - for (curr = procs; curr->type >= 0; curr++) { - if (curr->type == nh[off]) { - /* type specific length/alignment - checks will be performed in the - func(). */ - if (curr->func(skb, off) == false) + if (hopbyhop) { + switch (nh[off]) { + case IPV6_TLV_ROUTERALERT: + if (!ipv6_hop_ra(skb, off)) + return false; + break; + case IPV6_TLV_IOAM: + if (!ipv6_hop_ioam(skb, off)) + return false; + break; + case IPV6_TLV_JUMBO: + if (!ipv6_hop_jumbo(skb, off)) + return false; + break; + case IPV6_TLV_CALIPSO: + if (!ipv6_hop_calipso(skb, off)) + return false; + break; + default: + if (!ip6_tlvopt_unknown(skb, off, + disallow_unknowns)) + return false; + break; + } + } else { + switch (nh[off]) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case IPV6_TLV_HAO: + if (!ipv6_dest_hao(skb, off)) + return false; + break; +#endif + default: + if (!ip6_tlvopt_unknown(skb, off, + disallow_unknowns)) return false; break; } } - if (curr->type < 0 && - !ip6_tlvopt_unknown(skb, off, disallow_unknowns)) - return false; - padlen = 0; - break; } off += optlen; len -= optlen; @@ -265,16 +286,6 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) } #endif -static const struct tlvtype_proc tlvprocdestopt_lst[] = { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - { - .type = IPV6_TLV_HAO, - .func = ipv6_dest_hao, - }, -#endif - {-1, NULL} -}; - static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); @@ -305,8 +316,7 @@ fail_and_free: dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(tlvprocdestopt_lst, skb, - init_net.ipv6.sysctl.max_dst_opts_cnt)) { + if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -929,6 +939,60 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) return false; } +/* IOAM */ + +static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) +{ + struct ioam6_trace_hdr *trace; + struct ioam6_namespace *ns; + struct ioam6_hdr *hdr; + + /* Bad alignment (must be 4n-aligned) */ + if (optoff & 3) + goto drop; + + /* Ignore if IOAM is not enabled on ingress */ + if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) + goto ignore; + + /* Truncated Option header */ + hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); + if (hdr->opt_len < 2) + goto drop; + + switch (hdr->type) { + case IOAM6_TYPE_PREALLOC: + /* Truncated Pre-allocated Trace header */ + if (hdr->opt_len < 2 + sizeof(*trace)) + goto drop; + + /* Malformed Pre-allocated Trace header */ + trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); + if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) + goto drop; + + /* Ignore if the IOAM namespace is unknown */ + ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); + if (!ns) + goto ignore; + + if (!skb_valid_dst(skb)) + ip6_route_input(skb); + + ioam6_fill_trace_data(skb, ns, trace); + break; + default: + break; + } + +ignore: + return true; + +drop: + kfree_skb(skb); + return false; +} + /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) @@ -995,22 +1059,6 @@ drop: return false; } -static const struct tlvtype_proc tlvprochopopt_lst[] = { - { - .type = IPV6_TLV_ROUTERALERT, - .func = ipv6_hop_ra, - }, - { - .type = IPV6_TLV_JUMBO, - .func = ipv6_hop_jumbo, - }, - { - .type = IPV6_TLV_CALIPSO, - .func = ipv6_hop_calipso, - }, - { -1, } -}; - int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); @@ -1036,8 +1084,7 @@ fail_and_free: goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(tlvprochopopt_lst, skb, - init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 8f9a83314de7..40f3e4f9f33a 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -467,7 +467,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; - int err = -ENOMEM; + int err; ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index e8398ffb5e35..a7c31ab67c5d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -725,6 +725,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); bool acast; + u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) @@ -740,8 +741,13 @@ static void icmpv6_echo_reply(struct sk_buff *skb) !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; + if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) + type = ICMPV6_EXT_ECHO_REPLY; + else + type = ICMPV6_ECHO_REPLY; + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); - tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) @@ -752,7 +758,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (saddr) fl6.saddr = *saddr; fl6.flowi6_oif = icmp6_iif(skb); - fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + fl6.fl6_icmp_type = type; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); @@ -783,13 +789,17 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.skb = skb; msg.offset = 0; - msg.type = ICMPV6_ECHO_REPLY; + msg.type = type; ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; + if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) + if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr)) + goto out_dst_release; + if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, @@ -911,6 +921,11 @@ static int icmpv6_rcv(struct sk_buff *skb) if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) icmpv6_echo_reply(skb); break; + case ICMPV6_EXT_ECHO_REQUEST: + if (!net->ipv6.sysctl.icmpv6_echo_ignore_all && + net->ipv4.sysctl_icmp_echo_enable_probe) + icmpv6_echo_reply(skb); + break; case ICMPV6_ECHO_REPLY: success = ping_rcv(skb); diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c new file mode 100644 index 000000000000..5e8961004832 --- /dev/null +++ b/net/ipv6/ioam6.c @@ -0,0 +1,910 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/ioam6.h> +#include <linux/ioam6_genl.h> +#include <linux/rhashtable.h> + +#include <net/addrconf.h> +#include <net/genetlink.h> +#include <net/ioam6.h> + +static void ioam6_ns_release(struct ioam6_namespace *ns) +{ + kfree_rcu(ns, rcu); +} + +static void ioam6_sc_release(struct ioam6_schema *sc) +{ + kfree_rcu(sc, rcu); +} + +static void ioam6_free_ns(void *ptr, void *arg) +{ + struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr; + + if (ns) + ioam6_ns_release(ns); +} + +static void ioam6_free_sc(void *ptr, void *arg) +{ + struct ioam6_schema *sc = (struct ioam6_schema *)ptr; + + if (sc) + ioam6_sc_release(sc); +} + +static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_namespace *ns = obj; + + return (ns->id != *(__be16 *)arg->key); +} + +static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_schema *sc = obj; + + return (sc->id != *(u32 *)arg->key); +} + +static const struct rhashtable_params rht_ns_params = { + .key_len = sizeof(__be16), + .key_offset = offsetof(struct ioam6_namespace, id), + .head_offset = offsetof(struct ioam6_namespace, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_ns_cmpfn, +}; + +static const struct rhashtable_params rht_sc_params = { + .key_len = sizeof(u32), + .key_offset = offsetof(struct ioam6_schema, id), + .head_offset = offsetof(struct ioam6_schema, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_sc_cmpfn, +}; + +static struct genl_family ioam6_genl_family; + +static const struct nla_policy ioam6_genl_policy_addns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 }, + [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 }, +}; + +static const struct nla_policy ioam6_genl_policy_delns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, +}; + +static const struct nla_policy ioam6_genl_policy_addsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY, + .len = IOAM6_MAX_SCHEMA_DATA_LEN }, +}; + +static const struct nla_policy ioam6_genl_policy_delsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ioam6_genl_policy_ns_sc[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG }, +}; + +static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + u64 data64; + u32 data32; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (ns) { + err = -EEXIST; + goto out_unlock; + } + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) { + err = -ENOMEM; + goto out_unlock; + } + + ns->id = id; + + if (!info->attrs[IOAM6_ATTR_NS_DATA]) + data32 = IOAM6_U32_UNAVAILABLE; + else + data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); + + if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) + data64 = IOAM6_U64_UNAVAILABLE; + else + data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + + ns->data = cpu_to_be32(data32); + ns->data_wide = cpu_to_be64(data64); + + err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + kfree(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + sc = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + goto out_unlock; + + if (sc) + rcu_assign_pointer(sc->ns, NULL); + + ioam6_ns_release(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns, + u32 portid, + u32 seq, + u32 flags, + struct sk_buff *skb, + u8 cmd) +{ + struct ioam6_schema *sc; + u64 data64; + u32 data32; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + data32 = be32_to_cpu(ns->data); + data64 = be64_to_cpu(ns->data_wide); + + if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) || + (data32 != IOAM6_U32_UNAVAILABLE && + nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) || + (data64 != IOAM6_U64_UNAVAILABLE && + nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE, + data64, IOAM6_ATTR_PAD))) + goto nla_put_failure; + + rcu_read_lock(); + + sc = rcu_dereference(ns->schema); + if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpns_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->namespaces, iter); + + return 0; +} + +static int ioam6_genl_dumpns_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_namespace *ns; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + ns = rhashtable_walk_next(iter); + + if (IS_ERR(ns)) { + if (PTR_ERR(ns) == -EAGAIN) + continue; + err = PTR_ERR(ns); + goto done; + } else if (!ns) { + break; + } + + err = __ioam6_genl_dumpns_element(ns, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_NAMESPACES); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + int len, len_aligned, err; + struct ioam6_schema *sc; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (sc) { + err = -EEXIST; + goto out_unlock; + } + + len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]); + len_aligned = ALIGN(len, 4); + + sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL); + if (!sc) { + err = -ENOMEM; + goto out_unlock; + } + + sc->id = id; + sc->len = len_aligned; + sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24)); + nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len); + + err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto free_sc; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +free_sc: + kfree(sc); + goto out_unlock; +} + +static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + int err; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + + ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto out_unlock; + + if (ns) + rcu_assign_pointer(ns->schema, NULL); + + ioam6_sc_release(sc); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc, + u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) +{ + struct ioam6_namespace *ns; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) || + nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data)) + goto nla_put_failure; + + rcu_read_lock(); + + ns = rcu_dereference(sc->ns); + if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpsc_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->schemas, iter); + + return 0; +} + +static int ioam6_genl_dumpsc_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_schema *sc; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + sc = rhashtable_walk_next(iter); + + if (IS_ERR(sc)) { + if (PTR_ERR(sc) == -EAGAIN) + continue; + err = PTR_ERR(sc); + goto done; + } else if (!sc) { + break; + } + + err = __ioam6_genl_dumpsc_element(sc, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_SCHEMAS); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_namespace *ns, *ns_ref; + struct ioam6_schema *sc, *sc_ref; + struct ioam6_pernet_data *nsdata; + __be16 ns_id; + u32 sc_id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID] || + (!info->attrs[IOAM6_ATTR_SC_ID] && + !info->attrs[IOAM6_ATTR_SC_NONE])) + return -EINVAL; + + ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + if (info->attrs[IOAM6_ATTR_SC_NONE]) { + sc = NULL; + } else { + sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id, + rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + } + + sc_ref = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + if (sc_ref) + rcu_assign_pointer(sc_ref->ns, NULL); + rcu_assign_pointer(ns->schema, sc); + + if (sc) { + ns_ref = rcu_dereference_protected(sc->ns, + lockdep_is_held(&nsdata->lock)); + if (ns_ref) + rcu_assign_pointer(ns_ref->schema, NULL); + rcu_assign_pointer(sc->ns, ns); + } + + err = 0; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static const struct genl_ops ioam6_genl_ops[] = { + { + .cmd = IOAM6_CMD_ADD_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_NAMESPACES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpns_start, + .dumpit = ioam6_genl_dumpns, + .done = ioam6_genl_dumpns_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_ADD_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_SCHEMAS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpsc_start, + .dumpit = ioam6_genl_dumpsc, + .done = ioam6_genl_dumpsc_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_NS_SET_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_ns_set_schema, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_ns_sc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1, + }, +}; + +static struct genl_family ioam6_genl_family __ro_after_init = { + .name = IOAM6_GENL_NAME, + .version = IOAM6_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ioam6_genl_ops, + .n_ops = ARRAY_SIZE(ioam6_genl_ops), + .module = THIS_MODULE, +}; + +struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); +} + +static void __ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace, + struct ioam6_schema *sc, + u8 sclen) +{ + struct __kernel_sock_timeval ts; + u64 raw64; + u32 raw32; + u16 raw16; + u8 *data; + u8 byte; + + data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4; + + /* hop_lim and node_id */ + if (trace->type.bit0) { + byte = ipv6_hdr(skb)->hop_limit; + if (skb->dev) + byte--; + + raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; + + *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); + data += sizeof(__be32); + } + + /* ingress_if_id and egress_if_id */ + if (trace->type.bit1) { + if (!skb->dev) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + } + + /* timestamp seconds */ + if (trace->type.bit2) { + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); + + skb_get_new_timestamp(skb, &ts); + *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); + } + data += sizeof(__be32); + } + + /* timestamp subseconds */ + if (trace->type.bit3) { + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); + + if (!trace->type.bit2) + skb_get_new_timestamp(skb, &ts); + + *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + } + data += sizeof(__be32); + } + + /* transit delay */ + if (trace->type.bit4) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* namespace data */ + if (trace->type.bit5) { + *(__be32 *)data = ns->data; + data += sizeof(__be32); + } + + /* queue depth */ + if (trace->type.bit6) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* checksum complement */ + if (trace->type.bit7) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* hop_lim and node_id (wide) */ + if (trace->type.bit8) { + byte = ipv6_hdr(skb)->hop_limit; + if (skb->dev) + byte--; + + raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; + + *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); + data += sizeof(__be64); + } + + /* ingress_if_id and egress_if_id (wide) */ + if (trace->type.bit9) { + if (!skb->dev) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + } + + /* namespace data (wide) */ + if (trace->type.bit10) { + *(__be64 *)data = ns->data_wide; + data += sizeof(__be64); + } + + /* buffer occupancy */ + if (trace->type.bit11) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* opaque state snapshot */ + if (trace->type.bit22) { + if (!sc) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8); + } else { + *(__be32 *)data = sc->hdr; + data += sizeof(__be32); + + memcpy(data, sc->data, sc->len); + } + } +} + +/* called with rcu_read_lock() */ +void ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace) +{ + struct ioam6_schema *sc; + u8 sclen = 0; + + /* Skip if Overflow flag is set OR + * if an unknown type (bit 12-21) is set + */ + if (trace->overflow || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) { + return; + } + + /* NodeLen does not include Opaque State Snapshot length. We need to + * take it into account if the corresponding bit is set (bit 22) and + * if the current IOAM namespace has an active schema attached to it + */ + sc = rcu_dereference(ns->schema); + if (trace->type.bit22) { + sclen = sizeof_field(struct ioam6_schema, hdr) / 4; + + if (sc) + sclen += sc->len / 4; + } + + /* If there is no space remaining, we set the Overflow flag and we + * skip without filling the trace + */ + if (!trace->remlen || trace->remlen < trace->nodelen + sclen) { + trace->overflow = 1; + return; + } + + __ioam6_fill_trace_data(skb, ns, trace, sc, sclen); + trace->remlen -= trace->nodelen + sclen; +} + +static int __net_init ioam6_net_init(struct net *net) +{ + struct ioam6_pernet_data *nsdata; + int err = -ENOMEM; + + nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL); + if (!nsdata) + goto out; + + mutex_init(&nsdata->lock); + net->ipv6.ioam6_data = nsdata; + + err = rhashtable_init(&nsdata->namespaces, &rht_ns_params); + if (err) + goto free_nsdata; + + err = rhashtable_init(&nsdata->schemas, &rht_sc_params); + if (err) + goto free_rht_ns; + +out: + return err; +free_rht_ns: + rhashtable_destroy(&nsdata->namespaces); +free_nsdata: + kfree(nsdata); + net->ipv6.ioam6_data = NULL; + goto out; +} + +static void __net_exit ioam6_net_exit(struct net *net) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL); + rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL); + + kfree(nsdata); +} + +static struct pernet_operations ioam6_net_ops = { + .init = ioam6_net_init, + .exit = ioam6_net_exit, +}; + +int __init ioam6_init(void) +{ + int err = register_pernet_subsys(&ioam6_net_ops); + if (err) + goto out; + + err = genl_register_family(&ioam6_genl_family); + if (err) + goto out_unregister_pernet_subsys; + +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + err = ioam6_iptunnel_init(); + if (err) + goto out_unregister_genl; +#endif + + pr_info("In-situ OAM (IOAM) with IPv6\n"); + +out: + return err; +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL +out_unregister_genl: + genl_unregister_family(&ioam6_genl_family); +#endif +out_unregister_pernet_subsys: + unregister_pernet_subsys(&ioam6_net_ops); + goto out; +} + +void ioam6_exit(void) +{ +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + ioam6_iptunnel_exit(); +#endif + genl_unregister_family(&ioam6_genl_family); + unregister_pernet_subsys(&ioam6_net_ops); +} diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c new file mode 100644 index 000000000000..f9ee04541c17 --- /dev/null +++ b/net/ipv6/ioam6_iptunnel.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM Lightweight Tunnel implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/netlink.h> +#include <linux/in6.h> +#include <linux/ioam6.h> +#include <linux/ioam6_iptunnel.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/lwtunnel.h> +#include <net/ioam6.h> + +#define IOAM6_MASK_SHORT_FIELDS 0xff100000 +#define IOAM6_MASK_WIDE_FIELDS 0xe00000 + +struct ioam6_lwt_encap { + struct ipv6_hopopt_hdr eh; + u8 pad[2]; /* 2-octet padding for 4n-alignment */ + struct ioam6_hdr ioamh; + struct ioam6_trace_hdr traceh; +} __packed; + +struct ioam6_lwt { + struct ioam6_lwt_encap tuninfo; +}; + +static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt) +{ + return (struct ioam6_lwt *)lwt->data; +} + +static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt) +{ + return &ioam6_lwt_state(lwt)->tuninfo; +} + +static struct ioam6_trace_hdr *ioam6_trace(struct lwtunnel_state *lwt) +{ + return &(ioam6_lwt_state(lwt)->tuninfo.traceh); +} + +static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), +}; + +static int nla_put_ioam6_trace(struct sk_buff *skb, int attrtype, + struct ioam6_trace_hdr *trace) +{ + struct ioam6_trace_hdr *data; + struct nlattr *nla; + int len; + + len = sizeof(*trace); + + nla = nla_reserve(skb, attrtype, len); + if (!nla) + return -EMSGSIZE; + + data = nla_data(nla); + memcpy(data, trace, len); + + return 0; +} + +static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) +{ + u32 fields; + + if (!trace->type_be32 || !trace->remlen || + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4) + return false; + + trace->nodelen = 0; + fields = be32_to_cpu(trace->type_be32); + + trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) + * (sizeof(__be32) / 4); + trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) + * (sizeof(__be64) / 4); + + return true; +} + +static int ioam6_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1]; + struct ioam6_lwt_encap *tuninfo; + struct ioam6_trace_hdr *trace; + struct lwtunnel_state *s; + int len_aligned; + int len, err; + + if (family != AF_INET6) + return -EINVAL; + + err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla, + ioam6_iptunnel_policy, extack); + if (err < 0) + return err; + + if (!tb[IOAM6_IPTUNNEL_TRACE]) { + NL_SET_ERR_MSG(extack, "missing trace"); + return -EINVAL; + } + + trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]); + if (!ioam6_validate_trace_hdr(trace)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE], + "invalid trace validation"); + return -EINVAL; + } + + len = sizeof(*tuninfo) + trace->remlen * 4; + len_aligned = ALIGN(len, 8); + + s = lwtunnel_state_alloc(len_aligned); + if (!s) + return -ENOMEM; + + tuninfo = ioam6_lwt_info(s); + tuninfo->eh.hdrlen = (len_aligned >> 3) - 1; + tuninfo->pad[0] = IPV6_TLV_PADN; + tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC; + tuninfo->ioamh.opt_type = IPV6_TLV_IOAM; + tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace) + + trace->remlen * 4; + + memcpy(&tuninfo->traceh, trace, sizeof(*trace)); + + len = len_aligned - len; + if (len == 1) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PAD1; + } else if (len > 0) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN; + tuninfo->traceh.data[trace->remlen * 4 + 1] = len - 2; + } + + s->type = LWTUNNEL_ENCAP_IOAM6; + s->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = s; + + return 0; +} + +static int ioam6_do_inline(struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo) +{ + struct ioam6_trace_hdr *trace; + struct ipv6hdr *oldhdr, *hdr; + struct ioam6_namespace *ns; + int hdrlen, err; + + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + + err = skb_cow_head(skb, hdrlen + skb->mac_len); + if (unlikely(err)) + return err; + + oldhdr = ipv6_hdr(skb); + skb_pull(skb, sizeof(*oldhdr)); + skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr)); + + skb_push(skb, sizeof(*oldhdr) + hdrlen); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + hdr = ipv6_hdr(skb); + memmove(hdr, oldhdr, sizeof(*oldhdr)); + tuninfo->eh.nexthdr = hdr->nexthdr; + + skb_set_transport_header(skb, sizeof(*hdr)); + skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen); + + memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); + + hdr->nexthdr = NEXTHDR_HOP; + hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); + + trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) + + sizeof(struct ipv6_hopopt_hdr) + 2 + + sizeof(struct ioam6_hdr)); + + ns = ioam6_namespace(dev_net(skb_dst(skb)->dev), trace->namespace_id); + if (ns) + ioam6_fill_trace_data(skb, ns, trace); + + return 0; +} + +static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct lwtunnel_state *lwt = skb_dst(skb)->lwtstate; + int err = -EINVAL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + /* Only for packets we send and + * that do not contain a Hop-by-Hop yet + */ + if (skb->dev || ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) + goto out; + + err = ioam6_do_inline(skb, ioam6_lwt_info(lwt)); + if (unlikely(err)) + goto drop; + + err = skb_cow_head(skb, LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + goto drop; + +out: + return lwt->orig_output(net, sk, skb); + +drop: + kfree_skb(skb); + return err; +} + +static int ioam6_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + + if (nla_put_ioam6_trace(skb, IOAM6_IPTUNNEL_TRACE, trace)) + return -EMSGSIZE; + + return 0; +} + +static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct ioam6_trace_hdr *trace = ioam6_trace(lwtstate); + + return nla_total_size(sizeof(*trace)); +} + +static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ioam6_trace_hdr *a_hdr = ioam6_trace(a); + struct ioam6_trace_hdr *b_hdr = ioam6_trace(b); + + return (a_hdr->namespace_id != b_hdr->namespace_id); +} + +static const struct lwtunnel_encap_ops ioam6_iptun_ops = { + .build_state = ioam6_build_state, + .output = ioam6_output, + .fill_encap = ioam6_fill_encap_info, + .get_encap_size = ioam6_encap_nlsize, + .cmp_encap = ioam6_encap_cmp, + .owner = THIS_MODULE, +}; + +int __init ioam6_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} + +void ioam6_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 679699e953f1..1bec5b22f80d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include <net/lwtunnel.h> #include <net/fib_notifier.h> +#include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> @@ -1340,7 +1341,7 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt, struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); - /* paired with smp_rmb() in rt6_get_cookie_safe() */ + /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { fn->fn_sernum = sernum; @@ -2355,6 +2356,10 @@ static int __net_init fib6_net_init(struct net *net) if (err) return err; + /* Default to 3-tuple */ + net->ipv6.sysctl.multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; + spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); @@ -2362,7 +2367,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) - goto out_timer; + goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); @@ -2407,7 +2412,7 @@ out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); -out_timer: +out_notifier: fib6_notifier_exit(net); return -ENOMEM; } @@ -2444,8 +2449,8 @@ int __init fib6_init(void) int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fib6_node), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index bc224f917bbd..3ad201d372d8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1244,8 +1244,9 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, memcpy(u->name, p->name, sizeof(u->name)); } -static int ip6gre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ip6gre_tunnel_siocdevprivate(struct net_device *dev, + struct ifreq *ifr, void __user *data, + int cmd) { int err = 0; struct ip6_tnl_parm2 p; @@ -1259,7 +1260,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -1270,7 +1271,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; @@ -1281,7 +1282,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, goto done; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) goto done; err = -EINVAL; @@ -1318,7 +1319,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); @@ -1331,7 +1332,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, if (dev == ign->fb_tunnel_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) goto done; err = -ENOENT; ip6gre_tnl_parm_from_user(&p1, &p); @@ -1398,7 +1399,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_init = ip6gre_tunnel_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, - .ndo_do_ioctl = ip6gre_tunnel_ioctl, + .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ff4f9ebcf7f6..12f985f43bcc 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,18 +60,29 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; - const struct in6_addr *nexthop; + struct inet6_dev *idev = ip6_dst_idev(dst); + unsigned int hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *daddr, *nexthop; + struct ipv6hdr *hdr; struct neighbour *neigh; int ret; - if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + /* Be paranoid, rather than too clever. */ + if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { + skb = skb_expand_head(skb, hh_len); + if (!skb) { + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + return -ENOMEM; + } + } + hdr = ipv6_hdr(skb); + daddr = &hdr->daddr; + if (ipv6_addr_is_multicast(daddr)) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || - ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr))) { + ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing @@ -82,7 +93,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); - if (ipv6_hdr(skb)->hop_limit == 0) { + if (hdr->hop_limit == 0) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); @@ -91,9 +102,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); - - if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= - IPV6_ADDR_SCOPE_NODELOCAL && + if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && !(dev->flags & IFF_LOOPBACK)) { kfree_skb(skb); return 0; @@ -108,10 +117,10 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); - neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); + nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); + neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) - neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); + neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); @@ -120,7 +129,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } rcu_read_unlock_bh(); - IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } @@ -240,6 +249,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst->dev; + struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; @@ -247,22 +258,16 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int hlimit = -1; u32 mtu; - head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; - if (unlikely(skb_headroom(skb) < head_room)) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); + if (unlikely(head_room > skb_headroom(skb))) { + skb = skb_expand_head(skb, head_room); + if (!skb) { + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); return -ENOBUFS; } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } if (opt) { @@ -304,8 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { - IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUT, skb->len); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -318,17 +322,17 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * we promote our socket to non const */ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dst->dev, + net, (struct sock *)sk, skb, NULL, dev, dst_output); } - skb->dev = dst->dev; + skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const */ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } @@ -479,7 +483,9 @@ int ip6_forward(struct sk_buff *skb) if (skb_warn_if_lro(skb)) goto drop; - if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + if (!net->ipv6.devconf_all->disable_policy && + !idev->cnf.disable_policy && + !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -519,9 +525,10 @@ int ip6_forward(struct sk_buff *skb) if (net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); - if (proxied > 0) + if (proxied > 0) { + hdr->hop_limit--; return ip6_input(skb); - else if (proxied < 0) { + } else if (proxied < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -577,7 +584,7 @@ int ip6_forward(struct sk_buff *skb) } } - mtu = ip6_dst_mtu_forward(dst); + mtu = ip6_dst_mtu_maybe_forward(dst, true); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; @@ -1055,13 +1062,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * ip6_route_output will fail given src=any saddr, though, so * that's why we try it again later. */ - if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + if (ipv6_addr_any(&fl6->saddr)) { struct fib6_info *from; struct rt6_info *rt; - bool had_dst = *dst != NULL; - if (!had_dst) - *dst = ip6_route_output(net, sk, fl6); + *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; rcu_read_lock(); @@ -1078,7 +1083,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * never existed and let the SA-enabled version take * over. */ - if (!had_dst && (*dst)->error) { + if ((*dst)->error) { dst_release(*dst); *dst = NULL; } @@ -1555,7 +1560,7 @@ emsgsize: unsigned int datalen; unsigned int fraglen; unsigned int fraggap; - unsigned int alloclen; + unsigned int alloclen, alloc_extra; unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ @@ -1582,17 +1587,28 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; pagedlen = 0; + alloc_extra = hh_len; + alloc_extra += dst_exthdrlen; + alloc_extra += rt->dst.trailer_len; + + /* We just reserve space for fragment header. + * Note: this may be overallocation if the message + * (without MSG_MORE) fits into the MTU. + */ + alloc_extra += sizeof(struct frag_hdr); + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else if (!paged) + else if (!paged && + (fraglen + alloc_extra < SKB_MAX_ALLOC || + !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = min_t(int, fraglen, MAX_HEADER); pagedlen = fraglen - alloclen; } - - alloclen += dst_exthdrlen; + alloclen += alloc_extra; if (datalen != length + fraggap) { /* @@ -1602,30 +1618,21 @@ alloc_new_skb: datalen += rt->dst.trailer_len; } - alloclen += rt->dst.trailer_len; fraglen = datalen + fragheaderlen; - /* - * We just reserve space for fragment header. - * Note: this may be overallocation if the message - * (without MSG_MORE) fits into the MTU. - */ - alloclen += sizeof(struct frag_hdr); - copy = datalen - transhdrlen - fraggap - pagedlen; if (copy < 0) { err = -EINVAL; goto error; } if (transhdrlen) { - skb = sock_alloc_send_skb(sk, - alloclen + hh_len, + skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = alloc_skb(alloclen + hh_len, + skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 288bafded998..20a67efda47f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -837,6 +837,7 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; + skb_reset_mac_header(skb); } skb_reset_network_header(skb); @@ -1239,8 +1240,6 @@ route_lookup: if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; - skb_set_inner_ipproto(skb, proto); - err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; @@ -1377,6 +1376,8 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + skb_set_inner_ipproto(skb, protocol); + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { @@ -1580,9 +1581,10 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) } /** - * ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace + * @ifr: unused + * @data: parameters passed from userspace * @cmd: command to be performed * * Description: @@ -1608,7 +1610,8 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) **/ static int -ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; @@ -1622,7 +1625,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -1634,9 +1637,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) { + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; - } break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: @@ -1644,7 +1646,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && @@ -1668,7 +1670,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { @@ -1682,7 +1684,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); @@ -1801,7 +1803,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, - .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 2d048e21abbb..1d8e3ffa225d 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -771,13 +771,14 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) } /** - * vti6_ioctl - configure vti6 tunnels from userspace + * vti6_siocdevprivate - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace + * @ifr: unused + * @data: parameters passed from userspace * @cmd: command to be performed * * Description: - * vti6_ioctl() is used for managing vti6 tunnels + * vti6_siocdevprivate() is used for managing vti6 tunnels * from userspace. * * The possible commands are the following: @@ -798,7 +799,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) * %-ENODEV if attempting to change or delete a nonexisting device **/ static int -vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; @@ -810,7 +811,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -822,7 +823,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: @@ -831,7 +832,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != 0) @@ -852,7 +853,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (t) { err = 0; vti6_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else @@ -865,7 +866,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; vti6_parm_from_user(&p1, &p); @@ -890,7 +891,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, - .ndo_do_ioctl = vti6_ioctl, + .ndo_siocdevprivate = vti6_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 06b0d2c329b9..36ed9efb8825 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -559,8 +559,7 @@ static int pim6_rcv(struct sk_buff *skb) read_lock(&mrt_lock); if (reg_vif_num >= 0) reg_dev = mrt->vif_table[reg_vif_num].dev; - if (reg_dev) - dev_hold(reg_dev); + dev_hold(reg_dev); read_unlock(&mrt_lock); if (!reg_dev) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index daef890460b7..15f984be3570 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -172,14 +172,12 @@ static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ipcomp6_type = { - .description = "IPCOMP6", .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp6_init_state, .destructor = ipcomp_destroy, .input = ipcomp_input, .output = ipcomp_output, - .hdr_offset = xfrm6_find_1stfragopt, }; static struct xfrm6_protocol ipcomp6_protocol = { diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a6804a7e34c1..e4bdb09c5586 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -225,7 +225,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; - ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); + ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; @@ -234,7 +234,7 @@ out_free_gsf: static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; @@ -249,7 +249,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (!p) return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; @@ -261,14 +261,14 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, goto out_free_p; ret = -EINVAL; - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, - .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); @@ -1048,7 +1048,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, static int ipv6_get_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct group_filter, gf_slist); + const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter __user *p = optval; struct group_filter gsf; int num; @@ -1062,7 +1062,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval, return -EADDRNOTAVAIL; num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; @@ -1077,7 +1077,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval, static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, int __user *optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; @@ -1100,7 +1100,7 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, return -EADDRNOTAVAIL; lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist); + err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex); release_sock(sk); if (err) return err; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d36ef9d25e73..bed8155508c8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -447,7 +447,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) count += psl->sl_max; - newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -457,7 +458,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } psl = newpsl; @@ -525,8 +527,9 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, goto done; } if (gsf->gf_numsrc) { - newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), - GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, + gsf->gf_numsrc), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -543,7 +546,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { mutex_unlock(&idev->mc_lock); - sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); + sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, + newpsl->sl_max)); goto done; } mutex_unlock(&idev->mc_lock); @@ -559,7 +563,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); @@ -1351,8 +1356,8 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, return 0; } -static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, - unsigned long *max_delay) +static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, + unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); @@ -1362,7 +1367,7 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, idev->mc_maxdelay = *max_delay; - return 0; + return; } /* called with rcu_read_lock() */ @@ -1449,9 +1454,7 @@ static void __mld_query_work(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); - err = mld_process_v2(idev, mlh2, &max_delay); - if (err < 0) - goto out; + mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) @@ -1729,22 +1732,25 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { + u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, + 2, 0, 0, IPV6_TLV_PADN, 0 }; struct net_device *dev = idev->dev; - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; - struct sk_buff *skb; - struct mld2_report *pmr; - struct in6_addr addr_buf; - const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - unsigned int size = mtu + hlen + tlen; + struct net *net = dev_net(dev); + const struct in6_addr *saddr; + struct in6_addr addr_buf; + struct mld2_report *pmr; + struct sk_buff *skb; + unsigned int size; + struct sock *sk; int err; - u8 ra[8] = { IPPROTO_ICMPV6, 0, - IPV6_TLV_ROUTERALERT, 2, 0, 0, - IPV6_TLV_PADN, 0 }; - /* we assume size > sizeof(ra) here */ + sk = net->ipv6.igmp_sk; + /* we assume size > sizeof(ra) here + * Also try to not allocate high-order pages for big MTU + */ + size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; @@ -2604,7 +2610,8 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); - atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 878fcec14949..aeb35d26e474 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -247,54 +247,6 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, return err; } -static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, - u8 **nexthdr) -{ - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb_tail_pointer(skb) - - skb_network_header(skb); - int found_rhdr = 0; - - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { - - switch (**nexthdr) { - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - found_rhdr = 1; - break; - case NEXTHDR_DEST: - /* - * HAO MUST NOT appear more than once. - * XXX: It is better to try to find by the end of - * XXX: packet if HAO exists. - */ - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { - net_dbg_ratelimited("mip6: hao exists already, override\n"); - return offset; - } - - if (found_rhdr) - return offset; - - break; - default: - return offset; - } - - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - } - - return offset; -} - static int mip6_destopt_init_state(struct xfrm_state *x) { if (x->id.spi) { @@ -324,7 +276,6 @@ static void mip6_destopt_destroy(struct xfrm_state *x) } static const struct xfrm_type mip6_destopt_type = { - .description = "MIP6DESTOPT", .owner = THIS_MODULE, .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, @@ -333,7 +284,6 @@ static const struct xfrm_type mip6_destopt_type = { .input = mip6_destopt_input, .output = mip6_destopt_output, .reject = mip6_destopt_reject, - .hdr_offset = mip6_destopt_offset, }; static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) @@ -383,53 +333,6 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, - u8 **nexthdr) -{ - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb_tail_pointer(skb) - - skb_network_header(skb); - int found_rhdr = 0; - - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { - - switch (**nexthdr) { - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - if (offset + 3 <= packet_len) { - struct ipv6_rt_hdr *rt; - rt = (struct ipv6_rt_hdr *)(nh + offset); - if (rt->type != 0) - return offset; - } - found_rhdr = 1; - break; - case NEXTHDR_DEST: - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) - return offset; - - if (found_rhdr) - return offset; - - break; - default: - return offset; - } - - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - } - - return offset; -} - static int mip6_rthdr_init_state(struct xfrm_state *x) { if (x->id.spi) { @@ -456,7 +359,6 @@ static void mip6_rthdr_destroy(struct xfrm_state *x) } static const struct xfrm_type mip6_rthdr_type = { - .description = "MIP6RT", .owner = THIS_MODULE, .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, @@ -464,7 +366,6 @@ static const struct xfrm_type mip6_rthdr_type = { .destructor = mip6_rthdr_destroy, .input = mip6_rthdr_input, .output = mip6_rthdr_output, - .hdr_offset = mip6_rthdr_offset, }; static int __init mip6_init(void) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c467c6419893..4b098521a44c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1391,12 +1391,6 @@ skip_defrtr: } } - /* - * Send a notify if RA changed managed/otherconf flags or timer settings - */ - if (send_ifinfo_notify) - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); - skip_linkparms: /* @@ -1496,6 +1490,11 @@ skip_routeinfo: memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); + if (in6_dev->ra_mtu != mtu) { + in6_dev->ra_mtu = mtu; + send_ifinfo_notify = true; + } + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { @@ -1519,6 +1518,12 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: + /* Send a notify if RA changed managed/otherconf flags or + * timer settings or ra_mtu value + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + fib6_info_release(rt); if (neigh) neigh_release(neigh); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e810a23baf99..de2cf3943b91 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -51,7 +51,7 @@ ip6_packet_match(const struct sk_buff *skb, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, - int *fragoff, bool *hotdrop) + u16 *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index bb784ea7bbd3..727ee8097012 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -19,15 +19,12 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static int __net_init ip6table_filter_table_init(struct net *net); - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, - .table_init = ip6table_filter_table_init, }; /* The work comes in here from netfilter.c. */ @@ -44,7 +41,7 @@ static struct nf_hook_ops *filter_ops __read_mostly; static bool forward = true; module_param(forward, bool, 0000); -static int __net_init ip6table_filter_table_init(struct net *net) +static int ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; int err; @@ -63,7 +60,7 @@ static int __net_init ip6table_filter_table_init(struct net *net) static int __net_init ip6table_filter_net_init(struct net *net) { - if (net == &init_net || !forward) + if (!forward) return ip6table_filter_table_init(net); return 0; @@ -87,15 +84,24 @@ static struct pernet_operations ip6table_filter_net_ops = { static int __init ip6table_filter_init(void) { - int ret; + int ret = xt_register_template(&packet_filter, + ip6table_filter_table_init); + + if (ret < 0) + return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook); - if (IS_ERR(filter_ops)) + if (IS_ERR(filter_ops)) { + xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); + } ret = register_pernet_subsys(&ip6table_filter_net_ops); - if (ret < 0) + if (ret < 0) { + xt_unregister_template(&packet_filter); kfree(filter_ops); + return ret; + } return ret; } @@ -103,6 +109,7 @@ static int __init ip6table_filter_init(void) static void __exit ip6table_filter_fini(void) { unregister_pernet_subsys(&ip6table_filter_net_ops); + xt_unregister_template(&packet_filter); kfree(filter_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index c76cffd63041..9b518ce37d6a 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -20,15 +20,12 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -static int __net_init ip6table_mangle_table_init(struct net *net); - static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, - .table_init = ip6table_mangle_table_init, }; static unsigned int @@ -76,7 +73,7 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init ip6table_mangle_table_init(struct net *net) +static int ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; int ret; @@ -106,29 +103,32 @@ static struct pernet_operations ip6table_mangle_net_ops = { static int __init ip6table_mangle_init(void) { - int ret; + int ret = xt_register_template(&packet_mangler, + ip6table_mangle_table_init); + + if (ret < 0) + return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) + if (IS_ERR(mangle_ops)) { + xt_unregister_template(&packet_mangler); return PTR_ERR(mangle_ops); + } ret = register_pernet_subsys(&ip6table_mangle_net_ops); if (ret < 0) { + xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } - ret = ip6table_mangle_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&ip6table_mangle_net_ops); - kfree(mangle_ops); - } return ret; } static void __exit ip6table_mangle_fini(void) { unregister_pernet_subsys(&ip6table_mangle_net_ops); + xt_unregister_template(&packet_mangler); kfree(mangle_ops); } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index b0292251e655..921c1723a01e 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -19,8 +19,6 @@ struct ip6table_nat_pernet { struct nf_hook_ops *nf_nat_ops; }; -static int __net_init ip6table_nat_table_init(struct net *net); - static unsigned int ip6table_nat_net_id __read_mostly; static const struct xt_table nf_nat_ipv6_table = { @@ -31,7 +29,6 @@ static const struct xt_table nf_nat_ipv6_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV6, - .table_init = ip6table_nat_table_init, }; static unsigned int ip6table_nat_do_chain(void *priv, @@ -115,7 +112,7 @@ static void ip6t_nat_unregister_lookups(struct net *net) kfree(ops); } -static int __net_init ip6table_nat_table_init(struct net *net) +static int ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; int ret; @@ -157,20 +154,23 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = register_pernet_subsys(&ip6table_nat_net_ops); + int ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); - if (ret) + if (ret < 0) return ret; - ret = ip6table_nat_table_init(&init_net); + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret) - unregister_pernet_subsys(&ip6table_nat_net_ops); + xt_unregister_template(&nf_nat_ipv6_table); + return ret; } static void __exit ip6table_nat_exit(void) { unregister_pernet_subsys(&ip6table_nat_net_ops); + xt_unregister_template(&nf_nat_ipv6_table); } module_init(ip6table_nat_init); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index f63c106c521e..4f2a04af71d3 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -11,8 +11,6 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static int __net_init ip6table_raw_table_init(struct net *net); - static bool raw_before_defrag __read_mostly; MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); module_param(raw_before_defrag, bool, 0000); @@ -23,7 +21,6 @@ static const struct xt_table packet_raw = { .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW, - .table_init = ip6table_raw_table_init, }; static const struct xt_table packet_raw_before_defrag = { @@ -32,7 +29,6 @@ static const struct xt_table packet_raw_before_defrag = { .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, - .table_init = ip6table_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -45,7 +41,7 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init ip6table_raw_table_init(struct net *net) +static int ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; const struct xt_table *table = &packet_raw; @@ -79,37 +75,39 @@ static struct pernet_operations ip6table_raw_net_ops = { static int __init ip6table_raw_init(void) { - int ret; const struct xt_table *table = &packet_raw; + int ret; if (raw_before_defrag) { table = &packet_raw_before_defrag; - pr_info("Enabling raw table before defrag\n"); } + ret = xt_register_template(table, ip6table_raw_table_init); + if (ret < 0) + return ret; + /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook); - if (IS_ERR(rawtable_ops)) + if (IS_ERR(rawtable_ops)) { + xt_unregister_template(table); return PTR_ERR(rawtable_ops); + } ret = register_pernet_subsys(&ip6table_raw_net_ops); if (ret < 0) { kfree(rawtable_ops); + xt_unregister_template(table); return ret; } - ret = ip6table_raw_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&ip6table_raw_net_ops); - kfree(rawtable_ops); - } return ret; } static void __exit ip6table_raw_fini(void) { unregister_pernet_subsys(&ip6table_raw_net_ops); + xt_unregister_template(&packet_raw); kfree(rawtable_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 8dc335cf450b..931674034d8b 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -24,15 +24,12 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static int __net_init ip6table_security_table_init(struct net *net); - static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, - .table_init = ip6table_security_table_init, }; static unsigned int @@ -44,7 +41,7 @@ ip6table_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init ip6table_security_table_init(struct net *net) +static int ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; int ret; @@ -74,29 +71,32 @@ static struct pernet_operations ip6table_security_net_ops = { static int __init ip6table_security_init(void) { - int ret; + int ret = xt_register_template(&security_table, + ip6table_security_table_init); + + if (ret < 0) + return ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook); - if (IS_ERR(sectbl_ops)) + if (IS_ERR(sectbl_ops)) { + xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); + } ret = register_pernet_subsys(&ip6table_security_net_ops); if (ret < 0) { kfree(sectbl_ops); + xt_unregister_template(&security_table); return ret; } - ret = ip6table_security_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&ip6table_security_net_ops); - kfree(sectbl_ops); - } return ret; } static void __exit ip6table_security_fini(void) { unregister_pernet_subsys(&ip6table_security_net_ops); + xt_unregister_template(&security_table); kfree(sectbl_ops); } diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 6fd54744cbc3..aa5bb8789ba0 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -99,7 +99,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, { __be16 dport, sport; const struct in6_addr *daddr = NULL, *saddr = NULL; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; @@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { - struct ipv6hdr ipv6_var; - if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, &sport, &dport, &ipv6_var)) return NULL; diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 7969d1f3018d..ed69c768797e 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,7 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index af36acc1a644..2880dc7d9a49 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -15,29 +15,11 @@ static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { - const struct { - struct in6_addr dst; - struct in6_addr src; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .dst = *dst, - .src = *src, - }; - u32 hash, id; - - /* Note the following code is not safe, but this is okay. */ - if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) - get_random_bytes(&net->ipv4.ip_id_key, - sizeof(net->ipv4.ip_id_key)); - - hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key); - - /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, - * set the hight order instead thus minimizing possible future - * collisions. - */ - id = ip_idents_reserve(hash, 1); - if (unlikely(!id)) - id = 1 << 31; + u32 id; + + do { + id = prandom_u32(); + } while (!id); return id; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index bf3646b57c68..60f1e4f5be5a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -354,7 +354,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, if (np->recverr || harderr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d417e514bd52..dbc224023977 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -41,6 +41,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/jhash.h> +#include <linux/siphash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> @@ -1484,17 +1485,24 @@ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) static u32 rt6_exception_hash(const struct in6_addr *dst, const struct in6_addr *src) { - static u32 seed __read_mostly; - u32 val; + static siphash_key_t rt6_exception_key __read_mostly; + struct { + struct in6_addr dst; + struct in6_addr src; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .dst = *dst, + }; + u64 val; - net_get_random_once(&seed, sizeof(seed)); - val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed); + net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); #ifdef CONFIG_IPV6_SUBTREES if (src) - val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val); + combined.src = *src; #endif - return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); + val = siphash(&combined, sizeof(combined), &rt6_exception_key); + + return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } /* Helper function to find the cached rt in the hash table @@ -1649,6 +1657,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; + int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); @@ -1703,7 +1712,9 @@ static int rt6_insert_exception(struct rt6_info *nrt, bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; - if (bucket->depth > FIB6_MAX_DEPTH) + /* Randomize max depth to avoid some side channels attacks. */ + max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH); + while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: @@ -2326,12 +2337,131 @@ out: } } +static u32 rt6_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 rt6_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. + */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 rt6_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 rt6_multipath_custom_hash_fl6(const struct net *net, + const struct flowi6 *fl6) +{ + u32 hash_fields = ip6_multipath_hash_fields(net); + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v6addrs.src = fl6->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v6addrs.dst = fl6->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl6->flowi6_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl6->fl6_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: @@ -2345,6 +2475,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: if (skb) { @@ -2376,6 +2507,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2412,9 +2544,15 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); + break; + case 3: + if (skb) + mhash = rt6_multipath_custom_hash_skb(net, skb); + else + mhash = rt6_multipath_custom_hash_fl6(net, fl6); break; } - mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } @@ -3074,25 +3212,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { - struct inet6_dev *idev; - unsigned int mtu; - - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - goto out; - - mtu = IPV6_MIN_MTU; - - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - -out: - mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); - - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); @@ -3517,8 +3637,7 @@ out: if (err) { lwtstate_put(fib6_nh->fib_nh_lws); fib6_nh->fib_nh_lws = NULL; - if (dev) - dev_put(dev); + dev_put(dev); } return err; @@ -3642,7 +3761,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, err = PTR_ERR(rt->fib6_metrics); /* Do not leave garbage there. */ rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - goto out; + goto out_free; } if (cfg->fc_flags & RTF_ADDRCONF) @@ -6511,7 +6630,7 @@ int __init ip6_route_init(void) ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 897fa59c47de..3adc5d9211ad 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -26,6 +26,7 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif +#include <linux/netfilter.h> static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { @@ -295,11 +296,19 @@ static int seg6_do_srh(struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + nf_reset_ct(skb); return 0; } -static int seg6_input(struct sk_buff *skb) +static int seg6_input_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dst_input(skb); +} + +static int seg6_input_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; @@ -337,15 +346,46 @@ static int seg6_input(struct sk_buff *skb) if (unlikely(err)) return err; - return dst_input(skb); + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + dev_net(skb->dev), NULL, skb, NULL, + skb_dst(skb)->dev, seg6_input_finish); + + return seg6_input_finish(dev_net(skb->dev), NULL, skb); } -static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int seg6_input_nf(struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct net *net = dev_net(skb->dev); + + switch (skb->protocol) { + case htons(ETH_P_IP): + return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL, + skb, NULL, dev, seg6_input_core); + case htons(ETH_P_IPV6): + return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL, + skb, NULL, dev, seg6_input_core); + } + + return -EINVAL; +} + +static int seg6_input(struct sk_buff *skb) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return seg6_input_nf(skb); + + return seg6_input_core(dev_net(skb->dev), NULL, skb); +} + +static int seg6_output_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct seg6_lwt *slwt; - int err = -EINVAL; + int err; err = seg6_do_srh(skb); if (unlikely(err)) @@ -387,12 +427,40 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(err)) goto drop; + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, + NULL, skb_dst(skb)->dev, dst_output); + return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } +static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + + switch (skb->protocol) { + case htons(ETH_P_IP): + return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, + NULL, dev, seg6_output_core); + case htons(ETH_P_IPV6): + return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, + NULL, dev, seg6_output_core); + } + + return -EINVAL; +} + +static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return seg6_output_nf(net, sk, skb); + + return seg6_output_core(net, sk, skb); +} + static int seg6_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 4ff38cb08f4b..2dc40b3f373e 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -30,6 +30,7 @@ #include <net/seg6_local.h> #include <linux/etherdevice.h> #include <linux/bpf.h> +#include <linux/netfilter.h> #define SEG6_F_ATTR(i) BIT(i) @@ -87,10 +88,10 @@ struct seg6_end_dt_info { int vrf_ifindex; int vrf_table; - /* tunneled packet proto and family (IPv4 or IPv6) */ - __be16 proto; + /* tunneled packet family (IPv4 or IPv6). + * Protocol and header length are inferred from family. + */ u16 family; - int hdrlen; }; struct pcpu_seg6_local_counters { @@ -413,12 +414,33 @@ drop: return -EINVAL; } +static int input_action_end_dx6_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct in6_addr *nhaddr = NULL; + struct seg6_local_lwt *slwt; + + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); + + /* The inner packet is not associated to any local interface, + * so we do not call netif_rx(). + * + * If slwt->nh6 is set to ::, then lookup the nexthop for the + * inner packet's DA. Otherwise, use the specified nexthop. + */ + if (!ipv6_addr_any(&slwt->nh6)) + nhaddr = &slwt->nh6; + + seg6_lookup_nexthop(skb, nhaddr, 0); + + return dst_input(skb); +} + /* decapsulate and forward to specified nexthop */ static int input_action_end_dx6(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct in6_addr *nhaddr = NULL; - /* this function accepts IPv6 encapsulated packets, with either * an SRH with SL=0, or no SRH. */ @@ -429,40 +451,30 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - /* The inner packet is not associated to any local interface, - * so we do not call netif_rx(). - * - * If slwt->nh6 is set to ::, then lookup the nexthop for the - * inner packet's DA. Otherwise, use the specified nexthop. - */ - - if (!ipv6_addr_any(&slwt->nh6)) - nhaddr = &slwt->nh6; - skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + nf_reset_ct(skb); - seg6_lookup_nexthop(skb, nhaddr, 0); + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, NULL, + skb_dst(skb)->dev, input_action_end_dx6_finish); - return dst_input(skb); + return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return -EINVAL; } -static int input_action_end_dx4(struct sk_buff *skb, - struct seg6_local_lwt *slwt) +static int input_action_end_dx4_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { + struct dst_entry *orig_dst = skb_dst(skb); + struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; int err; - if (!decap_and_validate(skb, IPPROTO_IPIP)) - goto drop; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - - skb->protocol = htons(ETH_P_IP); + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); iph = ip_hdr(skb); @@ -470,14 +482,34 @@ static int input_action_end_dx4(struct sk_buff *skb, skb_dst_drop(skb); - skb_set_transport_header(skb, sizeof(struct iphdr)); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) - goto drop; + if (err) { + kfree_skb(skb); + return -EINVAL; + } return dst_input(skb); +} + +static int input_action_end_dx4(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + if (!decap_and_validate(skb, IPPROTO_IPIP)) + goto drop; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + skb->protocol = htons(ETH_P_IP); + skb_set_transport_header(skb, sizeof(struct iphdr)); + nf_reset_ct(skb); + + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, NULL, + skb_dst(skb)->dev, input_action_end_dx4_finish); + + return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return -EINVAL; @@ -521,19 +553,6 @@ static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg, info->net = net; info->vrf_ifindex = vrf_ifindex; - switch (family) { - case AF_INET: - info->proto = htons(ETH_P_IP); - info->hdrlen = sizeof(struct iphdr); - break; - case AF_INET6: - info->proto = htons(ETH_P_IPV6); - info->hdrlen = sizeof(struct ipv6hdr); - break; - default: - return -EINVAL; - } - info->family = family; info->mode = DT_VRF_MODE; @@ -622,22 +641,45 @@ error: } static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb, - struct seg6_local_lwt *slwt) + struct seg6_local_lwt *slwt, u16 family) { struct seg6_end_dt_info *info = &slwt->dt_info; struct net_device *vrf; + __be16 protocol; + int hdrlen; vrf = end_dt_get_vrf_rcu(skb, info); if (unlikely(!vrf)) goto drop; - skb->protocol = info->proto; + switch (family) { + case AF_INET: + protocol = htons(ETH_P_IP); + hdrlen = sizeof(struct iphdr); + break; + case AF_INET6: + protocol = htons(ETH_P_IPV6); + hdrlen = sizeof(struct ipv6hdr); + break; + case AF_UNSPEC: + fallthrough; + default: + goto drop; + } + + if (unlikely(info->family != AF_UNSPEC && info->family != family)) { + pr_warn_once("seg6local: SRv6 End.DT* family mismatch"); + goto drop; + } + + skb->protocol = protocol; skb_dst_drop(skb); - skb_set_transport_header(skb, info->hdrlen); + skb_set_transport_header(skb, hdrlen); + nf_reset_ct(skb); - return end_dt_vrf_rcv(skb, info->family, vrf); + return end_dt_vrf_rcv(skb, family, vrf); drop: kfree_skb(skb); @@ -656,7 +698,7 @@ static int input_action_end_dt4(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - skb = end_dt_vrf_core(skb, slwt); + skb = end_dt_vrf_core(skb, slwt, AF_INET); if (!skb) /* packet has been processed and consumed by the VRF */ return 0; @@ -739,7 +781,7 @@ static int input_action_end_dt6(struct sk_buff *skb, goto legacy_mode; /* DT6_VRF_MODE */ - skb = end_dt_vrf_core(skb, slwt); + skb = end_dt_vrf_core(skb, slwt, AF_INET6); if (!skb) /* packet has been processed and consumed by the VRF */ return 0; @@ -767,6 +809,36 @@ drop: return -EINVAL; } +#ifdef CONFIG_NET_L3_MASTER_DEV +static int seg6_end_dt46_build(struct seg6_local_lwt *slwt, const void *cfg, + struct netlink_ext_ack *extack) +{ + return __seg6_end_dt_vrf_build(slwt, cfg, AF_UNSPEC, extack); +} + +static int input_action_end_dt46(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + unsigned int off = 0; + int nexthdr; + + nexthdr = ipv6_find_hdr(skb, &off, -1, NULL, NULL); + if (unlikely(nexthdr < 0)) + goto drop; + + switch (nexthdr) { + case IPPROTO_IPIP: + return input_action_end_dt4(skb, slwt); + case IPPROTO_IPV6: + return input_action_end_dt6(skb, slwt); + } + +drop: + kfree_skb(skb); + return -EINVAL; +} +#endif + /* push an SRH on top of the current one */ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -969,6 +1041,17 @@ static struct seg6_action_desc seg6_action_table[] = { .input = input_action_end_dt6, }, { + .action = SEG6_LOCAL_ACTION_END_DT46, + .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), + .optattrs = SEG6_F_LOCAL_COUNTERS, +#ifdef CONFIG_NET_L3_MASTER_DEV + .input = input_action_end_dt46, + .slwt_ops = { + .build_state = seg6_end_dt46_build, + }, +#endif + }, + { .action = SEG6_LOCAL_ACTION_END_B6, .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), .optattrs = SEG6_F_LOCAL_COUNTERS, @@ -1028,7 +1111,8 @@ static void seg6_local_update_counters(struct seg6_local_lwt *slwt, u64_stats_update_end(&pcounters->syncp); } -static int seg6_local_input(struct sk_buff *skb) +static int seg6_local_input_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct seg6_action_desc *desc; @@ -1036,11 +1120,6 @@ static int seg6_local_input(struct sk_buff *skb) unsigned int len = skb->len; int rc; - if (skb->protocol != htons(ETH_P_IPV6)) { - kfree_skb(skb); - return -EINVAL; - } - slwt = seg6_local_lwtunnel(orig_dst->lwtstate); desc = slwt->desc; @@ -1054,6 +1133,21 @@ static int seg6_local_input(struct sk_buff *skb) return rc; } +static int seg6_local_input(struct sk_buff *skb) +{ + if (skb->protocol != htons(ETH_P_IPV6)) { + kfree_skb(skb); + return -EINVAL; + } + + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + seg6_local_input_core); + + return seg6_local_input_core(dev_net(skb->dev), NULL, skb); +} + static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_ACTION] = { .type = NLA_U32 }, [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index f7c8110ece5f..ef0c7a7c18e2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -299,9 +299,8 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { - struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data; struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; @@ -321,7 +320,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? - kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) : + kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; rcu_read_lock(); @@ -334,7 +333,8 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) * For root users, retry allocating enough memory for * the answer. */ - kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | + __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; @@ -453,8 +453,8 @@ out: return err; } -static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, - int cmd) +static int ipip6_tunnel_prl_ctl(struct net_device *dev, + struct ip_tunnel_prl __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl prl; @@ -465,7 +465,7 @@ static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + if (copy_from_user(&prl, data, sizeof(prl))) return -EFAULT; switch (cmd) { @@ -710,6 +710,8 @@ static int ipip6_rcv(struct sk_buff *skb) * old iph is no longer valid */ iph = (const struct iphdr *)skb_mac_header(skb); + skb_reset_mac_header(skb); + err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -780,6 +782,8 @@ static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; + skb_reset_mac_header(skb); + return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } @@ -973,7 +977,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (df) { mtu = dst_mtu(&rt->dst) - t_hlen; - if (mtu < 68) { + if (mtu < IPV4_MIN_MTU) { dev->stats.collisions++; ip_rt_put(rt); goto tx_error; @@ -1193,14 +1197,14 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } static int -ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1211,13 +1215,14 @@ ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd))) + if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) return -EFAULT; return 0; } static int -ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, + int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; @@ -1225,7 +1230,7 @@ ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd))) + if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) return -EFAULT; if (cmd != SIOCDEL6RD) { @@ -1364,27 +1369,28 @@ ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) } static int -ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { switch (cmd) { case SIOCGETTUNNEL: case SIOCADDTUNNEL: case SIOCCHGTUNNEL: case SIOCDELTUNNEL: - return ip_tunnel_ioctl(dev, ifr, cmd); + return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); case SIOCGETPRL: - return ipip6_tunnel_get_prl(dev, ifr); + return ipip6_tunnel_get_prl(dev, data); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: - return ipip6_tunnel_prl_ctl(dev, ifr, cmd); + return ipip6_tunnel_prl_ctl(dev, data, cmd); #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: - return ipip6_tunnel_get6rd(dev, ifr); + return ipip6_tunnel_get6rd(dev, data); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: - return ipip6_tunnel_6rdctl(dev, ifr, cmd); + return ipip6_tunnel_6rdctl(dev, data, cmd); #endif default: return -EINVAL; @@ -1395,7 +1401,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, - .ndo_do_ioctl = ipip6_tunnel_ioctl, + .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 27102c3d6e1d..d53dd142bf87 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,13 +17,20 @@ #include <net/addrconf.h> #include <net/inet_frag.h> #include <net/netevent.h> +#include <net/ip_fib.h> #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif +#include <linux/ioam6.h> static int two = 2; +static int three = 3; static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static u32 rt6_multipath_hash_fields_all_mask = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; +static u32 ioam6_id_max = IOAM6_DEFAULT_ID; +static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -40,6 +47,22 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, return ret; } +static int +proc_rt6_multipath_hash_fields(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} + static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", @@ -149,7 +172,16 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, + }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv6.sysctl.multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_fields, + .extra1 = SYSCTL_ONE, + .extra2 = &rt6_multipath_hash_fields_all_mask, }, { .procname = "seg6_flowlabel", @@ -167,6 +199,22 @@ static struct ctl_table ipv6_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "ioam6_id", + .data = &init_net.ipv6.sysctl.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = &ioam6_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &init_net.ipv6.sysctl.ioam6_id_wide, + .maxlen = sizeof(u64), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra2 = &ioam6_id_wide_max, + }, { } }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f47c0b6e3de..0ce52d46e4f8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -348,11 +348,20 @@ failure: static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; + u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; - dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + mtu = READ_ONCE(tcp_sk(sk)->mtu_info); + + /* Drop requests trying to increase our current mss. + * Check done in __ip6_rt_update_pmtu() is too late. + */ + if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) + return; + + dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -433,6 +442,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { + u32 mtu = ntohl(info); + /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). @@ -443,7 +454,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!ip6_sk_accept_pmtu(sk)) goto out; - tp->mtu_info = ntohl(info); + if (mtu < IPV6_MIN_MTU) + goto out; + + WRITE_ONCE(tp->mtu_info, mtu); + if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, @@ -467,7 +482,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk)) { sk->sk_err = err; - sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ tcp_done(sk); } else @@ -486,7 +501,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else sk->sk_err_soft = err; @@ -540,7 +555,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, + err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt, tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); @@ -1538,6 +1553,7 @@ discard: kfree_skb(skb); return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1663,10 +1679,18 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + sock_hold(sk); } - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { @@ -1754,6 +1778,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3fcd86f4dfdc..ea53847b5b7e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -502,12 +502,14 @@ static struct sock *__udp6_lib_err_encap(struct net *net, const struct ipv6hdr *hdr, int offset, struct udphdr *uh, struct udp_table *udptable, + struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, __be32 info) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; - struct sock *sk; + struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); @@ -518,18 +520,28 @@ static struct sock *__udp6_lib_err_encap(struct net *net, /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, offset); + if (sk) { + up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (lookup && lookup(sk, skb)) + sk = NULL; + + goto out; + } + sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source, &hdr->saddr, uh->dest, inet6_iif(skb), 0, udptable, skb); if (sk) { - int (*lookup)(struct sock *sk, struct sk_buff *skb); - struct udp_sock *up = udp_sk(sk); + up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } +out: if (!sk) { sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code, offset, info)); @@ -558,16 +570,17 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, NULL); + if (!sk || udp_sk(sk)->encap_type) { /* No socket for error: try tunnels before discarding */ - sk = ERR_PTR(-ENOENT); if (static_branch_unlikely(&udpv6_encap_needed_key)) { sk = __udp6_lib_err_encap(net, hdr, offset, uh, - udptable, skb, + udptable, sk, skb, opt, type, code, info); if (!sk) return 0; - } + } else + sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), @@ -610,7 +623,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: return 0; } @@ -1296,7 +1309,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); ipcm6_init(&ipc6); - ipc6.gso_size = up->gso_size; + ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = sk->sk_tsflags; ipc6.sockc.mark = sk->sk_mark; @@ -1462,7 +1475,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8b84d534b19d..d0d280077721 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -16,13 +16,6 @@ #include <net/ip6_route.h> #include <net/xfrm.h> -int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, - u8 **prevhdr) -{ - return ip6_find_1stfragopt(skb, prevhdr); -} -EXPORT_SYMBOL(xfrm6_find_1stfragopt); - void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; @@ -56,7 +49,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; - int mtu; + unsigned int mtu; bool toobig; #ifdef CONFIG_NETFILTER diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f696d46e6910..2b31112c0856 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -291,7 +291,6 @@ static void xfrm6_tunnel_destroy(struct xfrm_state *x) } static const struct xfrm_type xfrm6_tunnel_type = { - .description = "IP6IP6", .owner = THIS_MODULE, .proto = IPPROTO_IPV6, .init_state = xfrm6_tunnel_init_state, diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 0fdb389c3390..18316ee3c692 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -44,6 +44,7 @@ static struct proto iucv_proto = { }; static struct iucv_interface *pr_iucv; +static struct iucv_handler af_iucv_handler; /* special AF_IUCV IPRM messages */ static const u8 iprm_shutdown[8] = @@ -91,28 +92,11 @@ static void iucv_sock_close(struct sock *sk); static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify); -/* Call Back functions */ -static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); -static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); -static void iucv_callback_connack(struct iucv_path *, u8 *); -static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *); -static void iucv_callback_connrej(struct iucv_path *, u8 *); -static void iucv_callback_shutdown(struct iucv_path *, u8 *); - static struct iucv_sock_list iucv_sk_list = { .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock), .autobind_name = ATOMIC_INIT(0) }; -static struct iucv_handler af_iucv_handler = { - .path_pending = iucv_callback_connreq, - .path_complete = iucv_callback_connack, - .path_severed = iucv_callback_connrej, - .message_pending = iucv_callback_rx, - .message_complete = iucv_callback_txdone, - .path_quiesced = iucv_callback_shutdown, -}; - static inline void high_nmcpy(unsigned char *dst, char *src) { memcpy(dst, src, 8); @@ -1060,7 +1044,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (err == 0) { atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); - kfree_skb(skb); + consume_skb(skb); } /* this error should never happen since the */ @@ -1309,7 +1293,7 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg, } } - kfree_skb(skb); + consume_skb(skb); if (iucv->transport == AF_IUCV_TRANS_HIPER) { atomic_inc(&iucv->msg_recv); if (atomic_read(&iucv->msg_recv) > iucv->msglimit) { @@ -1772,7 +1756,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_unlock_irqrestore(&list->lock, flags); if (this) { - kfree_skb(this); + consume_skb(this); /* wake up any process waiting for sending */ iucv_sock_wake_msglim(sk); } @@ -1817,6 +1801,15 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16]) bh_unlock_sock(sk); } +static struct iucv_handler af_iucv_handler = { + .path_pending = iucv_callback_connreq, + .path_complete = iucv_callback_connack, + .path_severed = iucv_callback_connrej, + .message_pending = iucv_callback_rx, + .message_complete = iucv_callback_txdone, + .path_quiesced = iucv_callback_shutdown, +}; + /***************** HiperSockets transport callbacks ********************/ static void afiucv_swap_src_dest(struct sk_buff *skb) { @@ -1910,17 +1903,17 @@ static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb) { struct iucv_sock *iucv = iucv_sk(sk); - if (!iucv) - goto out; - if (sk->sk_state != IUCV_BOUND) - goto out; + if (!iucv || sk->sk_state != IUCV_BOUND) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + bh_lock_sock(sk); iucv->msglimit_peer = iucv_trans_hdr(skb)->window; sk->sk_state = IUCV_CONNECTED; sk->sk_state_change(sk); bh_unlock_sock(sk); -out: - kfree_skb(skb); + consume_skb(skb); return NET_RX_SUCCESS; } @@ -1931,16 +1924,16 @@ static int afiucv_hs_callback_synfin(struct sock *sk, struct sk_buff *skb) { struct iucv_sock *iucv = iucv_sk(sk); - if (!iucv) - goto out; - if (sk->sk_state != IUCV_BOUND) - goto out; + if (!iucv || sk->sk_state != IUCV_BOUND) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + bh_lock_sock(sk); sk->sk_state = IUCV_DISCONN; sk->sk_state_change(sk); bh_unlock_sock(sk); -out: - kfree_skb(skb); + consume_skb(skb); return NET_RX_SUCCESS; } @@ -1952,16 +1945,18 @@ static int afiucv_hs_callback_fin(struct sock *sk, struct sk_buff *skb) struct iucv_sock *iucv = iucv_sk(sk); /* other end of connection closed */ - if (!iucv) - goto out; + if (!iucv) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + bh_lock_sock(sk); if (sk->sk_state == IUCV_CONNECTED) { sk->sk_state = IUCV_DISCONN; sk->sk_state_change(sk); } bh_unlock_sock(sk); -out: - kfree_skb(skb); + consume_skb(skb); return NET_RX_SUCCESS; } @@ -2114,7 +2109,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, case (AF_IUCV_FLAG_WIN): err = afiucv_hs_callback_win(sk, skb); if (skb->len == sizeof(struct af_iucv_trans_hdr)) { - kfree_skb(skb); + consume_skb(skb); break; } fallthrough; /* and receive non-zero length data */ @@ -2269,21 +2264,11 @@ static struct packet_type iucv_packet_type = { .func = afiucv_hs_rcv, }; -static int afiucv_iucv_init(void) -{ - return pr_iucv->iucv_register(&af_iucv_handler, 0); -} - -static void afiucv_iucv_exit(void) -{ - pr_iucv->iucv_unregister(&af_iucv_handler, 0); -} - static int __init afiucv_init(void) { int err; - if (MACHINE_IS_VM) { + if (MACHINE_IS_VM && IS_ENABLED(CONFIG_IUCV)) { cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err); if (unlikely(err)) { WARN_ON(err); @@ -2291,11 +2276,7 @@ static int __init afiucv_init(void) goto out; } - pr_iucv = try_then_request_module(symbol_get(iucv_if), "iucv"); - if (!pr_iucv) { - printk(KERN_WARNING "iucv_if lookup failed\n"); - memset(&iucv_userid, 0, sizeof(iucv_userid)); - } + pr_iucv = &iucv_if; } else { memset(&iucv_userid, 0, sizeof(iucv_userid)); pr_iucv = NULL; @@ -2309,7 +2290,7 @@ static int __init afiucv_init(void) goto out_proto; if (pr_iucv) { - err = afiucv_iucv_init(); + err = pr_iucv->iucv_register(&af_iucv_handler, 0); if (err) goto out_sock; } @@ -2323,23 +2304,19 @@ static int __init afiucv_init(void) out_notifier: if (pr_iucv) - afiucv_iucv_exit(); + pr_iucv->iucv_unregister(&af_iucv_handler, 0); out_sock: sock_unregister(PF_IUCV); out_proto: proto_unregister(&iucv_proto); out: - if (pr_iucv) - symbol_put(iucv_if); return err; } static void __exit afiucv_exit(void) { - if (pr_iucv) { - afiucv_iucv_exit(); - symbol_put(iucv_if); - } + if (pr_iucv) + pr_iucv->iucv_unregister(&af_iucv_handler, 0); unregister_netdevice_notifier(&afiucv_netdev_notifier); dev_remove_pack(&iucv_packet_type); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 349c6ac3313f..f3343a8541a5 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -286,19 +286,19 @@ static union iucv_param *iucv_param_irq[NR_CPUS]; */ static inline int __iucv_call_b2f0(int command, union iucv_param *parm) { - register unsigned long reg0 asm ("0"); - register unsigned long reg1 asm ("1"); - int ccode; + int cc; - reg0 = command; - reg1 = (unsigned long)parm; asm volatile( - " .long 0xb2f01000\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1) - : "m" (*parm) : "cc"); - return ccode; + " lgr 0,%[reg0]\n" + " lgr 1,%[reg1]\n" + " .long 0xb2f01000\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), "+m" (*parm) + : [reg0] "d" ((unsigned long)command), + [reg1] "d" ((unsigned long)parm) + : "cc", "0", "1"); + return cc; } static inline int iucv_call_b2f0(int command, union iucv_param *parm) @@ -319,19 +319,21 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm) */ static int __iucv_query_maxconn(void *param, unsigned long *max_pathid) { - register unsigned long reg0 asm ("0"); - register unsigned long reg1 asm ("1"); - int ccode; + unsigned long reg1 = (unsigned long)param; + int cc; - reg0 = IUCV_QUERY; - reg1 = (unsigned long) param; asm volatile ( + " lghi 0,%[cmd]\n" + " lgr 1,%[reg1]\n" " .long 0xb2f01000\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc"); + " ipm %[cc]\n" + " srl %[cc],28\n" + " lgr %[reg1],1\n" + : [cc] "=&d" (cc), [reg1] "+&d" (reg1) + : [cmd] "K" (IUCV_QUERY) + : "cc", "0", "1"); *max_pathid = reg1; - return ccode; + return cc; } static int iucv_query_maxconn(void) @@ -500,14 +502,14 @@ static void iucv_setmask_mp(void) { int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) /* Enable all cpus with a declared buffer. */ if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask) && !cpumask_test_cpu(cpu, &iucv_irq_cpumask)) smp_call_function_single(cpu, iucv_allow_cpu, NULL, 1); - put_online_cpus(); + cpus_read_unlock(); } /** @@ -540,7 +542,7 @@ static int iucv_enable(void) size_t alloc_size; int cpu, rc; - get_online_cpus(); + cpus_read_lock(); rc = -ENOMEM; alloc_size = iucv_max_pathid * sizeof(struct iucv_path); iucv_path_table = kzalloc(alloc_size, GFP_KERNEL); @@ -553,12 +555,12 @@ static int iucv_enable(void) if (cpumask_empty(&iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ goto out; - put_online_cpus(); + cpus_read_unlock(); return 0; out: kfree(iucv_path_table); iucv_path_table = NULL; - put_online_cpus(); + cpus_read_unlock(); return rc; } @@ -571,11 +573,11 @@ out: */ static void iucv_disable(void) { - get_online_cpus(); + cpus_read_lock(); on_each_cpu(iucv_retrieve_cpu, NULL, 1); kfree(iucv_path_table); iucv_path_table = NULL; - put_online_cpus(); + cpus_read_unlock(); } static int iucv_cpu_dead(unsigned int cpu) @@ -784,7 +786,7 @@ static int iucv_reboot_event(struct notifier_block *this, if (cpumask_empty(&iucv_irq_cpumask)) return NOTIFY_DONE; - get_online_cpus(); + cpus_read_lock(); on_each_cpu_mask(&iucv_irq_cpumask, iucv_block_cpu, NULL, 1); preempt_disable(); for (i = 0; i < iucv_max_pathid; i++) { @@ -792,7 +794,7 @@ static int iucv_reboot_event(struct notifier_block *this, iucv_sever_pathid(i, NULL); } preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); iucv_disable(); return NOTIFY_DONE; } @@ -1635,14 +1637,16 @@ struct iucv_message_pending { u8 iptype; u32 ipmsgid; u32 iptrgcls; - union { - u32 iprmmsg1_u32; - u8 iprmmsg1[4]; - } ln1msg1; - union { - u32 ipbfln1f; - u8 iprmmsg2[4]; - } ln1msg2; + struct { + union { + u32 iprmmsg1_u32; + u8 iprmmsg1[4]; + } ln1msg1; + union { + u32 ipbfln1f; + u8 iprmmsg2[4]; + } ln1msg2; + } rmmsg; u32 res1[3]; u32 ipbfln2f; u8 ippollfg; @@ -1660,10 +1664,10 @@ static void iucv_message_pending(struct iucv_irq_data *data) msg.id = imp->ipmsgid; msg.class = imp->iptrgcls; if (imp->ipflags1 & IUCV_IPRMDATA) { - memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1, 8); + memcpy(msg.rmmsg, &imp->rmmsg, 8); msg.length = 8; } else - msg.length = imp->ln1msg2.ipbfln1f; + msg.length = imp->rmmsg.ln1msg2.ipbfln1f; msg.reply_size = imp->ipbfln2f; path->handler->message_pending(path, &msg); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 6201965bd822..11a715d76a4f 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -47,7 +47,7 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; - csk->sk_error_report(csk); + sk_error_report(csk); } static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, diff --git a/net/key/af_key.c b/net/key/af_key.c index ef9b4ac03e7b..de24a7d474df 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -141,7 +141,6 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; struct pfkey_sock *pfk; - int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -150,10 +149,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, if (protocol != PF_KEY_V2) return -EPROTONOSUPPORT; - err = -ENOMEM; sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) - goto out; + return -ENOMEM; pfk = pfkey_sk(sk); mutex_init(&pfk->dump_lock); @@ -169,8 +167,6 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, pfkey_insert(sk); return 0; -out: - return err; } static int pfkey_release(struct socket *sock) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 97ae1255fcb6..b3edafa5fba4 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -488,7 +488,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - /* We dont need to clone dst here, it is guaranteed to not disappear. + /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); @@ -635,7 +635,6 @@ static struct inet_protosw l2tp_ip_protosw = { static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, - .netns_ok = 1, }; static int __init l2tp_ip_init(void) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index aea85f91f059..bf35710127dd 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -226,7 +226,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int /* If the first two bytes are 0xFF03, consider that it is the PPP's * Address and Control fields and skip them. The L2TP module has always * worked this way, although, in theory, the use of these fields should - * be negociated and handled at the PPP layer. These fields are + * be negotiated and handled at the PPP layer. These fields are * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered * Information command with Poll/Final bit set to zero (RFC 1662). */ diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 1078e14f1acf..0971ca48ba15 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -80,11 +80,9 @@ static void __lapb_insert_cb(struct lapb_cb *lapb) static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) { - struct list_head *entry; struct lapb_cb *lapb, *use = NULL; - list_for_each(entry, &lapb_list) { - lapb = list_entry(entry, struct lapb_cb, node); + list_for_each_entry(lapb, &lapb_list, node) { if (lapb->dev == dev) { use = lapb; break; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 7180979114e4..3086f4a6ae68 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -98,8 +98,16 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; - if (addr->sllc_test || addr->sllc_xid) + if (addr->sllc_test) rc = LLC_PDU_LEN_U; + else if (addr->sllc_xid) + /* We need to expand header to sizeof(struct llc_xid_info) + * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header + * as XID PDU. In llc_ui_sendmsg() we reserved header size and then + * filled all other space with user data. If we won't reserve this + * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data + */ + rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; @@ -216,8 +224,7 @@ static int llc_ui_release(struct socket *sock) } else { release_sock(sk); } - if (llc->dev) - dev_put(llc->dev); + dev_put(llc->dev); sock_put(sk); llc_sk_free(sk); out: @@ -355,8 +362,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) } else llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); - if (llc->dev) - dev_hold(llc->dev); + dev_hold(llc->dev); rcu_read_unlock(); if (!llc->dev) goto out; diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index b554f26c68ee..79d1cef8f15a 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -79,7 +79,7 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) struct llc_sap_state_ev *ev = llc_sap_ev(skb); int rc; - llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7a99892e5aba..d69b31c20fe2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -152,6 +152,8 @@ static int ieee80211_change_iface(struct wiphy *wiphy, struct vif_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; int ret; ret = ieee80211_if_change_type(sdata, type); @@ -162,7 +164,24 @@ static int ieee80211_change_iface(struct wiphy *wiphy, RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); ieee80211_check_fast_rx_iface(sdata); } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (params->use_4addr == ifmgd->use_4addr) + return 0; + sdata->u.mgd.use_4addr = params->use_4addr; + if (!ifmgd->associated) + return 0; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get(sdata, ifmgd->bssid); + if (sta) + drv_sta_set_4addr(local, sdata, &sta->sta, + params->use_4addr); + mutex_unlock(&local->sta_mtx); + + if (params->use_4addr) + ieee80211_send_4addr_nullfunc(local, sdata); } if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { @@ -809,9 +828,11 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, return ret; } -static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, - const u8 *resp, size_t resp_len, - const struct ieee80211_csa_settings *csa) +static int +ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, + const u8 *resp, size_t resp_len, + const struct ieee80211_csa_settings *csa, + const struct ieee80211_color_change_settings *cca) { struct probe_resp *new, *old; @@ -831,6 +852,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp, csa->n_counter_offsets_presp * sizeof(new->cntdwn_counter_offsets[0])); + else if (cca) + new->cntdwn_counter_offsets[0] = cca->counter_offset_presp; rcu_assign_pointer(sdata->u.ap.probe_resp, new); if (old) @@ -936,7 +959,8 @@ static int ieee80211_set_ftm_responder_params( static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_beacon_data *params, - const struct ieee80211_csa_settings *csa) + const struct ieee80211_csa_settings *csa, + const struct ieee80211_color_change_settings *cca) { struct beacon_data *new, *old; int new_head_len, new_tail_len; @@ -985,6 +1009,9 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon, csa->n_counter_offsets_beacon * sizeof(new->cntdwn_counter_offsets[0])); + } else if (cca) { + new->cntdwn_current_counter = cca->count; + new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon; } /* copy in head */ @@ -1001,7 +1028,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, memcpy(new->tail, old->tail, new_tail_len); err = ieee80211_set_probe_resp(sdata, params->probe_resp, - params->probe_resp_len, csa); + params->probe_resp_len, csa, cca); if (err < 0) { kfree(new); return err; @@ -1156,7 +1183,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sdata->vif.bss_conf.beacon_tx_rate = params->beacon_rate; - err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); + err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL, NULL); if (err < 0) goto error; changed |= err; @@ -1211,17 +1238,17 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata_assert_lock(sdata); - /* don't allow changing the beacon while CSA is in place - offset + /* don't allow changing the beacon while a countdown is in place - offset * of channel switch counter may change */ - if (sdata->vif.csa_active) + if (sdata->vif.csa_active || sdata->vif.color_change_active) return -EBUSY; old = sdata_dereference(sdata->u.ap.beacon, sdata); if (!old) return -ENOENT; - err = ieee80211_assign_beacon(sdata, params, NULL); + err = ieee80211_assign_beacon(sdata, params, NULL, NULL); if (err < 0) return err; ieee80211_bss_info_change_notify(sdata, err); @@ -1442,6 +1469,38 @@ static void sta_apply_mesh_params(struct ieee80211_local *local, #endif } +static void sta_apply_airtime_params(struct ieee80211_local *local, + struct sta_info *sta, + struct station_parameters *params) +{ + u8 ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + struct airtime_sched_info *air_sched = &local->airtime[ac]; + struct airtime_info *air_info = &sta->airtime[ac]; + struct txq_info *txqi; + u8 tid; + + spin_lock_bh(&air_sched->lock); + for (tid = 0; tid < IEEE80211_NUM_TIDS + 1; tid++) { + if (air_info->weight == params->airtime_weight || + !sta->sta.txq[tid] || + ac != ieee80211_ac_from_tid(tid)) + continue; + + airtime_weight_set(air_info, params->airtime_weight); + + txqi = to_txq_info(sta->sta.txq[tid]); + if (RB_EMPTY_NODE(&txqi->schedule_order)) + continue; + + ieee80211_update_airtime_weight(local, air_sched, + 0, true); + } + spin_unlock_bh(&air_sched->lock); + } +} + static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) @@ -1629,7 +1688,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta_apply_mesh_params(local, sta, params); if (params->airtime_weight) - sta->airtime_weight = params->airtime_weight; + sta_apply_airtime_params(local, sta, params); + /* set the STA state after all sta info from usermode has been set */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) || @@ -1693,15 +1753,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, test_sta_flag(sta, WLAN_STA_ASSOC)) rate_control_rate_init(sta); - err = sta_info_insert_rcu(sta); - if (err) { - rcu_read_unlock(); - return err; - } - - rcu_read_unlock(); - - return 0; + return sta_info_insert(sta); } static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, @@ -3112,7 +3164,7 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, - NULL); + NULL, NULL); kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; @@ -3278,7 +3330,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, csa.n_counter_offsets_presp = params->n_counter_offsets_presp; csa.count = params->count; - err = ieee80211_assign_beacon(sdata, ¶ms->beacon_csa, &csa); + err = ieee80211_assign_beacon(sdata, ¶ms->beacon_csa, &csa, NULL); if (err < 0) { kfree(sdata->u.ap.next_beacon); return err; @@ -3367,6 +3419,15 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, return 0; } +static void ieee80211_color_change_abort(struct ieee80211_sub_if_data *sdata) +{ + sdata->vif.color_change_active = false; + kfree(sdata->u.ap.next_beacon); + sdata->u.ap.next_beacon = NULL; + + cfg80211_color_change_aborted_notify(sdata->dev); +} + static int __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) @@ -3435,6 +3496,10 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + /* if there is a color change in progress, abort it */ + if (sdata->vif.color_change_active) + ieee80211_color_change_abort(sdata); + err = ieee80211_set_csa_beacon(sdata, params, &changed); if (err) { ieee80211_vif_unreserve_chanctx(sdata); @@ -4086,6 +4151,196 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy, return local->ops->set_sar_specs(&local->hw, sar); } +static int +ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, + u32 *changed) +{ + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: { + int ret; + + ret = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, + NULL, NULL); + kfree(sdata->u.ap.next_beacon); + sdata->u.ap.next_beacon = NULL; + + if (ret < 0) + return ret; + + *changed |= ret; + break; + } + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} + +static int +ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_color_change_settings *params, + u32 *changed) +{ + struct ieee80211_color_change_settings color_change = {}; + int err; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + sdata->u.ap.next_beacon = + cfg80211_beacon_dup(¶ms->beacon_next); + if (!sdata->u.ap.next_beacon) + return -ENOMEM; + + if (params->count <= 1) + break; + + color_change.counter_offset_beacon = + params->counter_offset_beacon; + color_change.counter_offset_presp = + params->counter_offset_presp; + color_change.count = params->count; + + err = ieee80211_assign_beacon(sdata, ¶ms->beacon_color_change, + NULL, &color_change); + if (err < 0) { + kfree(sdata->u.ap.next_beacon); + return err; + } + *changed |= err; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void +ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, + u8 color, int enable, u32 changed) +{ + sdata->vif.bss_conf.he_bss_color.color = color; + sdata->vif.bss_conf.he_bss_color.enabled = enable; + changed |= BSS_CHANGED_HE_BSS_COLOR; + + ieee80211_bss_info_change_notify(sdata, changed); +} + +static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + u32 changed = 0; + int err; + + sdata_assert_lock(sdata); + lockdep_assert_held(&local->mtx); + + sdata->vif.color_change_active = false; + + err = ieee80211_set_after_color_change_beacon(sdata, &changed); + if (err) { + cfg80211_color_change_aborted_notify(sdata->dev); + return err; + } + + ieee80211_color_change_bss_config_notify(sdata, + sdata->vif.color_change_color, + 1, changed); + cfg80211_color_change_notify(sdata->dev); + + return 0; +} + +void ieee80211_color_change_finalize_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + color_change_finalize_work); + struct ieee80211_local *local = sdata->local; + + sdata_lock(sdata); + mutex_lock(&local->mtx); + + /* AP might have been stopped while waiting for the lock. */ + if (!sdata->vif.color_change_active) + goto unlock; + + if (!ieee80211_sdata_running(sdata)) + goto unlock; + + ieee80211_color_change_finalize(sdata); + +unlock: + mutex_unlock(&local->mtx); + sdata_unlock(sdata); +} + +void ieee80211_color_change_finish(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + ieee80211_queue_work(&sdata->local->hw, + &sdata->color_change_finalize_work); +} +EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); + +void +ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif, + u64 color_bitmap) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (sdata->vif.color_change_active || sdata->vif.csa_active) + return; + + cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap); +} +EXPORT_SYMBOL_GPL(ieeee80211_obss_color_collision_notify); + +static int +ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_color_change_settings *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + u32 changed = 0; + int err; + + sdata_assert_lock(sdata); + + mutex_lock(&local->mtx); + + /* don't allow another color change if one is already active or if csa + * is active + */ + if (sdata->vif.color_change_active || sdata->vif.csa_active) { + err = -EBUSY; + goto out; + } + + err = ieee80211_set_color_change_beacon(sdata, params, &changed); + if (err) + goto out; + + sdata->vif.color_change_active = true; + sdata->vif.color_change_color = params->color; + + cfg80211_color_change_started_notify(sdata->dev, params->count); + + if (changed) + ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed); + else + /* if the beacon didn't change, we can finalize immediately */ + ieee80211_color_change_finalize(sdata); + +out: + mutex_unlock(&local->mtx); + + return err; +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -4189,4 +4444,5 @@ const struct cfg80211_ops mac80211_config_ops = { .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, + .color_change = ieee80211_color_change, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 907bb1f748a1..76fc36a68750 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 - channel management + * Copyright 2020 - 2021 Intel Corporation */ #include <linux/nl80211.h> @@ -308,8 +309,8 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, * the max of min required widths of all the interfaces bound to this * channel context. */ -void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) +static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; @@ -326,7 +327,7 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; - return; + return 0; } max_bw = ieee80211_get_chanctx_max_required_bw(local, &ctx->conf); @@ -337,17 +338,21 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ieee80211_chandef_downgrade(&min_def); if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def)) - return; + return 0; ctx->conf.min_def = min_def; if (!ctx->driver_present) - return; + return 0; - drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_MIN_WIDTH); + return IEEE80211_CHANCTX_CHANGE_MIN_WIDTH; } +/* calling this function is assuming that station vif is updated to + * lates changes by calling ieee80211_vif_update_chandef + */ static void ieee80211_chan_bw_change(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + bool narrowed) { struct sta_info *sta; struct ieee80211_supported_band *sband = @@ -366,9 +371,16 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, continue; new_sta_bw = ieee80211_sta_cur_vht_bw(sta); + + /* nothing change */ if (new_sta_bw == sta->sta.bandwidth) continue; + /* vif changed to narrow BW and narrow BW for station wasn't + * requested or vise versa */ + if ((new_sta_bw < sta->sta.bandwidth) == !narrowed) + continue; + sta->sta.bandwidth = new_sta_bw; rate_control_rate_update(local, sband, sta, IEEE80211_RC_BW_CHANGED); @@ -376,21 +388,34 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, rcu_read_unlock(); } -static void ieee80211_change_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - const struct cfg80211_chan_def *chandef) +/* + * recalc the min required chan width of the channel context, which is + * the max of min required widths of all the interfaces bound to this + * channel context. + */ +void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { - enum nl80211_chan_width width; + u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx); - if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { - ieee80211_recalc_chanctx_min_def(local, ctx); + if (!changed) return; - } - WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + /* check is BW narrowed */ + ieee80211_chan_bw_change(local, ctx, true); - width = ctx->conf.def.width; - ctx->conf.def = *chandef; + drv_change_chanctx(local, ctx, changed); + + /* check is BW wider */ + ieee80211_chan_bw_change(local, ctx, false); +} + +static void ieee80211_change_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_chanctx *old_ctx, + const struct cfg80211_chan_def *chandef) +{ + u32 changed; /* expected to handle only 20/40/80/160 channel widths */ switch (chandef->width) { @@ -405,19 +430,33 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, WARN_ON(1); } - if (chandef->width < width) - ieee80211_chan_bw_change(local, ctx); + /* Check maybe BW narrowed - we do this _before_ calling recalc_chanctx_min_def + * due to maybe not returning from it, e.g in case new context was added + * first time with all parameters up to date. + */ + ieee80211_chan_bw_change(local, old_ctx, true); + + if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { + ieee80211_recalc_chanctx_min_def(local, ctx); + return; + } - drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH); - ieee80211_recalc_chanctx_min_def(local, ctx); + WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + + ctx->conf.def = *chandef; + + /* check if min chanctx also changed */ + changed = IEEE80211_CHANCTX_CHANGE_WIDTH | + _ieee80211_recalc_chanctx_min_def(local, ctx); + drv_change_chanctx(local, ctx, changed); if (!local->use_chanctx) { local->_oper_chandef = *chandef; ieee80211_hw_config(local, 0); } - if (chandef->width > width) - ieee80211_chan_bw_change(local, ctx); + /* check is BW wider */ + ieee80211_chan_bw_change(local, old_ctx, false); } static struct ieee80211_chanctx * @@ -450,7 +489,7 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (!compat) continue; - ieee80211_change_chanctx(local, ctx, compat); + ieee80211_change_chanctx(local, ctx, ctx, compat); return ctx; } @@ -679,7 +718,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, if (!compat) return; - ieee80211_change_chanctx(local, ctx, compat); + ieee80211_change_chanctx(local, ctx, ctx, compat); } static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, @@ -1107,13 +1146,12 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; - if (old_ctx->conf.def.width > new_ctx->conf.def.width) - ieee80211_chan_bw_change(local, new_ctx); + if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width) + changed = BSS_CHANGED_BANDWIDTH; - ieee80211_change_chanctx(local, new_ctx, chandef); + ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef); - if (old_ctx->conf.def.width < new_ctx->conf.def.width) - ieee80211_chan_bw_change(local, new_ctx); + ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef); vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; @@ -1142,14 +1180,9 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); - if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width) - changed = BSS_CHANGED_BANDWIDTH; - - ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef); - + ieee80211_recalc_chanctx_min_def(local, new_ctx); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); - ieee80211_recalc_chanctx_min_def(local, new_ctx); if (changed) ieee80211_bss_info_change_notify(sdata, changed); @@ -1188,7 +1221,7 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; - ieee80211_change_chanctx(local, new_ctx, chandef); + ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef); list_del(&sdata->reserved_chanctx_list); sdata->reserved_chanctx = NULL; @@ -1505,7 +1538,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); ieee80211_recalc_chanctx_min_def(local, ctx); - ieee80211_chan_bw_change(local, ctx); list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs, reserved_chanctx_list) { diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index fc34ae2b604c..8dbfe325ee66 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -216,14 +216,14 @@ static ssize_t aql_txq_limit_read(struct file *file, "VI %u %u\n" "BE %u %u\n" "BK %u %u\n", - local->aql_txq_limit_low[IEEE80211_AC_VO], - local->aql_txq_limit_high[IEEE80211_AC_VO], - local->aql_txq_limit_low[IEEE80211_AC_VI], - local->aql_txq_limit_high[IEEE80211_AC_VI], - local->aql_txq_limit_low[IEEE80211_AC_BE], - local->aql_txq_limit_high[IEEE80211_AC_BE], - local->aql_txq_limit_low[IEEE80211_AC_BK], - local->aql_txq_limit_high[IEEE80211_AC_BK]); + local->airtime[IEEE80211_AC_VO].aql_txq_limit_low, + local->airtime[IEEE80211_AC_VO].aql_txq_limit_high, + local->airtime[IEEE80211_AC_VI].aql_txq_limit_low, + local->airtime[IEEE80211_AC_VI].aql_txq_limit_high, + local->airtime[IEEE80211_AC_BE].aql_txq_limit_low, + local->airtime[IEEE80211_AC_BE].aql_txq_limit_high, + local->airtime[IEEE80211_AC_BK].aql_txq_limit_low, + local->airtime[IEEE80211_AC_BK].aql_txq_limit_high); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } @@ -255,11 +255,11 @@ static ssize_t aql_txq_limit_write(struct file *file, if (ac >= IEEE80211_NUM_ACS) return -EINVAL; - q_limit_low_old = local->aql_txq_limit_low[ac]; - q_limit_high_old = local->aql_txq_limit_high[ac]; + q_limit_low_old = local->airtime[ac].aql_txq_limit_low; + q_limit_high_old = local->airtime[ac].aql_txq_limit_high; - local->aql_txq_limit_low[ac] = q_limit_low; - local->aql_txq_limit_high[ac] = q_limit_high; + local->airtime[ac].aql_txq_limit_low = q_limit_low; + local->airtime[ac].aql_txq_limit_high = q_limit_high; mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { @@ -382,6 +382,46 @@ static const struct file_operations force_tx_status_ops = { .llseek = default_llseek, }; +static ssize_t airtime_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[200]; + u64 v_t[IEEE80211_NUM_ACS]; + u64 wt[IEEE80211_NUM_ACS]; + int len = 0, ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + spin_lock_bh(&local->airtime[ac].lock); + v_t[ac] = local->airtime[ac].v_t; + wt[ac] = local->airtime[ac].weight_sum; + spin_unlock_bh(&local->airtime[ac].lock); + } + len = scnprintf(buf, sizeof(buf), + "\tVO VI BE BK\n" + "Virt-t\t%-10llu %-10llu %-10llu %-10llu\n" + "Weight\t%-10llu %-10llu %-10llu %-10llu\n", + v_t[0], + v_t[1], + v_t[2], + v_t[3], + wt[0], + wt[1], + wt[2], + wt[3]); + + return simple_read_from_buffer(user_buf, count, ppos, + buf, len); +} + +static const struct file_operations airtime_ops = { + .read = airtime_read, + .open = simple_open, + .llseek = default_llseek, +}; + #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) @@ -632,7 +672,11 @@ void debugfs_hw_add(struct ieee80211_local *local) if (local->ops->wake_tx_queue) DEBUGFS_ADD_MODE(aqm, 0600); - DEBUGFS_ADD_MODE(airtime_flags, 0600); + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { + DEBUGFS_ADD_MODE(airtime, 0600); + DEBUGFS_ADD_MODE(airtime_flags, 0600); + } DEBUGFS_ADD(aql_txq_limit); debugfs_create_u32("aql_threshold", 0600, diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 0ad3860852ff..db724fc10a5f 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -57,7 +57,6 @@ static ssize_t ieee80211_if_write( return -EFAULT; buf[count] = '\0'; - ret = -ENODEV; rtnl_lock(); ret = (*write)(sdata, buf, count); rtnl_unlock(); @@ -513,6 +512,34 @@ static ssize_t ieee80211_if_fmt_aqm( } IEEE80211_IF_FILE_R(aqm); +static ssize_t ieee80211_if_fmt_airtime( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_txq *txq = sdata->vif.txq; + struct airtime_info *air_info; + int len; + + if (!txq) + return 0; + + spin_lock_bh(&local->airtime[txq->ac].lock); + air_info = to_airtime_info(txq); + len = scnprintf(buf, + buflen, + "RX: %llu us\nTX: %llu us\nWeight: %u\n" + "Virt-T: %lld us\n", + air_info->rx_airtime, + air_info->tx_airtime, + air_info->weight, + air_info->v_t); + spin_unlock_bh(&local->airtime[txq->ac].lock); + + return len; +} + +IEEE80211_IF_FILE_R(airtime); + IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX); /* IBSS attributes */ @@ -658,8 +685,10 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) if (sdata->local->ops->wake_tx_queue && sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && - sdata->vif.type != NL80211_IFTYPE_NAN) + sdata->vif.type != NL80211_IFTYPE_NAN) { DEBUGFS_ADD(aqm); + DEBUGFS_ADD(airtime); + } } static void add_sta_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 936c9dfa86c8..8be28cfd6f64 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -202,7 +202,7 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, size_t bufsz = 400; char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; u64 rx_airtime = 0, tx_airtime = 0; - s64 deficit[IEEE80211_NUM_ACS]; + u64 v_t[IEEE80211_NUM_ACS]; ssize_t rv; int ac; @@ -210,18 +210,18 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf, return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&local->airtime[ac].lock); rx_airtime += sta->airtime[ac].rx_airtime; tx_airtime += sta->airtime[ac].tx_airtime; - deficit[ac] = sta->airtime[ac].deficit; - spin_unlock_bh(&local->active_txq_lock[ac]); + v_t[ac] = sta->airtime[ac].v_t; + spin_unlock_bh(&local->airtime[ac].lock); } p += scnprintf(p, bufsz + buf - p, "RX: %llu us\nTX: %llu us\nWeight: %u\n" - "Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", - rx_airtime, tx_airtime, sta->airtime_weight, - deficit[0], deficit[1], deficit[2], deficit[3]); + "Virt-T: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n", + rx_airtime, tx_airtime, sta->airtime[0].weight, + v_t[0], v_t[1], v_t[2], v_t[3]); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); @@ -236,11 +236,11 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf, int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&local->airtime[ac].lock); sta->airtime[ac].rx_airtime = 0; sta->airtime[ac].tx_airtime = 0; - sta->airtime[ac].deficit = sta->airtime_weight; - spin_unlock_bh(&local->active_txq_lock[ac]); + sta->airtime[ac].v_t = 0; + spin_unlock_bh(&local->airtime[ac].lock); } return count; @@ -263,10 +263,10 @@ static ssize_t sta_aql_read(struct file *file, char __user *userbuf, return -ENOMEM; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - spin_lock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&local->airtime[ac].lock); q_limit_l[ac] = sta->airtime[ac].aql_limit_low; q_limit_h[ac] = sta->airtime[ac].aql_limit_high; - spin_unlock_bh(&local->active_txq_lock[ac]); + spin_unlock_bh(&local->airtime[ac].lock); q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending); } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 604ca59937f0..cd3731cbf6c6 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH -* Copyright (C) 2018 - 2019 Intel Corporation +* Copyright (C) 2018 - 2019, 2021 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -821,7 +821,7 @@ drv_allow_buffered_frames(struct ieee80211_local *local, static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - u16 duration) + struct ieee80211_prep_tx_info *info) { might_sleep(); @@ -829,9 +829,27 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); - trace_drv_mgd_prepare_tx(local, sdata, duration); + trace_drv_mgd_prepare_tx(local, sdata, info->duration, + info->subtype, info->success); if (local->ops->mgd_prepare_tx) - local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, duration); + local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info); + trace_drv_return_void(local); +} + +static inline void drv_mgd_complete_tx(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_prep_tx_info *info) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); + + trace_drv_mgd_complete_tx(local, sdata, info->duration, + info->subtype, info->success); + if (local->ops->mgd_complete_tx) + local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info); trace_drv_return_void(local); } @@ -1429,4 +1447,40 @@ static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_add_twt_setup(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_twt_setup *twt) +{ + struct ieee80211_twt_params *twt_agrt; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + twt_agrt = (void *)twt->params; + + trace_drv_add_twt_setup(local, sta, twt, twt_agrt); + local->ops->add_twt_setup(&local->hw, sta, twt); + trace_drv_return_void(local); +} + +static inline void drv_twt_teardown_request(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + u8 flowid) +{ + might_sleep(); + if (!check_sdata_in_driver(sdata)) + return; + + if (!local->ops->twt_teardown_request) + return; + + trace_drv_twt_teardown_request(local, sta, flowid); + local->ops->twt_teardown_request(&local->hw, sta, flowid); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 0c0b970835ce..c05af7018f79 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -111,7 +111,7 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; - struct ieee80211_sta_he_cap own_he_cap = sband->iftype_data->he_cap; + struct ieee80211_sta_he_cap own_he_cap; struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; u8 he_ppe_size; u8 mcs_nss_size; @@ -120,9 +120,13 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, memset(he_cap, 0, sizeof(*he_cap)); - if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband)) + if (!he_cap_ie || + !ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif))) return; + own_he_cap = sband->iftype_data->he_cap; + /* Make sure size is OK */ mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem); he_ppe_size = diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 3d62a80b5790..2eb7641f5556 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020 Intel Corporation + * Copyright(c) 2020-2021 Intel Corporation */ #include <linux/ieee80211.h> @@ -555,17 +555,15 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION && - vif->type != NL80211_IFTYPE_AP)) + if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION)) return; - if (vif->type == NL80211_IFTYPE_STATION) { - if (sdata->u.mgd.driver_smps_mode == smps_mode) - return; - sdata->u.mgd.driver_smps_mode = smps_mode; - ieee80211_queue_work(&sdata->local->hw, - &sdata->u.mgd.request_smps_work); - } + if (sdata->u.mgd.driver_smps_mode == smps_mode) + return; + + sdata->u.mgd.driver_smps_mode = smps_mode; + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); } /* this might change ... don't want non-open drivers using it */ EXPORT_SYMBOL_GPL(ieee80211_request_smps); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index a7ac53a2f00d..5d6ca4c3e698 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -489,7 +489,6 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies; u16 capability = WLAN_CAPABILITY_IBSS; u64 tsf; - int ret = 0; sdata_assert_lock(sdata); @@ -501,10 +500,8 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); - if (WARN_ON(!cbss)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON(!cbss)) + return -EINVAL; rcu_read_lock(); ies = rcu_dereference(cbss->ies); @@ -520,18 +517,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.basic_rates, capability, tsf, &ifibss->chandef, NULL, csa_settings); - if (!presp) { - ret = -ENOMEM; - goto out; - } + if (!presp) + return -ENOMEM; rcu_assign_pointer(ifibss->presp, presp); if (old_presp) kfree_rcu(old_presp, rcu_head); return BSS_CHANGED_BEACON; - out: - return ret; } int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 648696b49f89..159af6c3ffb0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #ifndef IEEE80211_I_H @@ -25,6 +25,7 @@ #include <linux/leds.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/rbtree.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> @@ -244,6 +245,12 @@ struct ieee80211_csa_settings { u8 count; }; +struct ieee80211_color_change_settings { + u16 counter_offset_beacon; + u16 counter_offset_presp; + u8 count; +}; + struct beacon_data { u8 *head, *tail; int head_len, tail_len; @@ -831,17 +838,16 @@ enum txq_info_flags { * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin * @def_cvars: codel vars for @def_flow - * @frags: used to keep fragments created after dequeue * @schedule_order: used with ieee80211_local->active_txqs - * @schedule_round: counter to prevent infinite loops on TXQ scheduling + * @frags: used to keep fragments created after dequeue */ struct txq_info { struct fq_tin tin; struct codel_vars def_cvars; struct codel_stats cstats; + struct rb_node schedule_order; + struct sk_buff_head frags; - struct list_head schedule_order; - u16 schedule_round; unsigned long flags; /* keep last! */ @@ -918,10 +924,14 @@ struct ieee80211_sub_if_data { struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; + struct airtime_info airtime[IEEE80211_NUM_ACS]; + struct work_struct csa_finalize_work; bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */ struct cfg80211_chan_def csa_chandef; + struct work_struct color_change_finalize_work; + struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */ struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */ @@ -936,6 +946,7 @@ struct ieee80211_sub_if_data { struct work_struct work; struct sk_buff_head skb_queue; + struct sk_buff_head status_queue; u8 needed_rx_chains; enum ieee80211_smps_mode smps_mode; @@ -1130,6 +1141,44 @@ enum mac80211_scan_state { SCAN_ABORT, }; +/** + * struct airtime_sched_info - state used for airtime scheduling and AQL + * + * @lock: spinlock that protects all the fields in this struct + * @active_txqs: rbtree of currently backlogged queues, sorted by virtual time + * @schedule_pos: the current position maintained while a driver walks the tree + * with ieee80211_next_txq() + * @active_list: list of struct airtime_info structs that were active within + * the last AIRTIME_ACTIVE_DURATION (100 ms), used to compute + * weight_sum + * @last_weight_update: used for rate limiting walking active_list + * @last_schedule_time: tracks the last time a transmission was scheduled; used + * for catching up v_t if no stations are eligible for + * transmission. + * @v_t: global virtual time; queues with v_t < this are eligible for + * transmission + * @weight_sum: total sum of all active stations used for dividing airtime + * @weight_sum_reciprocal: reciprocal of weight_sum (to avoid divisions in fast + * path - see comment above + * IEEE80211_RECIPROCAL_DIVISOR_64) + * @aql_txq_limit_low: AQL limit when total outstanding airtime + * is < IEEE80211_AQL_THRESHOLD + * @aql_txq_limit_high: AQL limit when total outstanding airtime + * is > IEEE80211_AQL_THRESHOLD + */ +struct airtime_sched_info { + spinlock_t lock; + struct rb_root_cached active_txqs; + struct rb_node *schedule_pos; + struct list_head active_list; + u64 last_weight_update; + u64 last_schedule_activity; + u64 v_t; + u64 weight_sum; + u64 weight_sum_reciprocal; + u32 aql_txq_limit_low; + u32 aql_txq_limit_high; +}; DECLARE_STATIC_KEY_FALSE(aql_disable); struct ieee80211_local { @@ -1143,13 +1192,8 @@ struct ieee80211_local { struct codel_params cparams; /* protects active_txqs and txqi->schedule_order */ - spinlock_t active_txq_lock[IEEE80211_NUM_ACS]; - struct list_head active_txqs[IEEE80211_NUM_ACS]; - u16 schedule_round[IEEE80211_NUM_ACS]; - + struct airtime_sched_info airtime[IEEE80211_NUM_ACS]; u16 airtime_flags; - u32 aql_txq_limit_low[IEEE80211_NUM_ACS]; - u32 aql_txq_limit_high[IEEE80211_NUM_ACS]; u32 aql_threshold; atomic_t aql_total_pending_airtime; @@ -1414,10 +1458,6 @@ struct ieee80211_local { /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; - - /* TDLS channel switch */ - struct work_struct tdls_chsw_work; - struct sk_buff_head skb_queue_tdls_chsw; }; static inline struct ieee80211_sub_if_data * @@ -1494,6 +1534,7 @@ struct ieee802_11_elems { const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_tx_pwr_env *tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; @@ -1544,6 +1585,8 @@ struct ieee802_11_elems { u8 perr_len; u8 country_elem_len; u8 bssid_index_len; + u8 tx_pwr_env_len[IEEE80211_TPE_MAX_IE_COUNT]; + u8 tx_pwr_env_num; /* whether a parse error occurred while retrieving these elements */ bool parse_error; @@ -1567,6 +1610,125 @@ static inline bool txq_has_queue(struct ieee80211_txq *txq) return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); } +static inline struct airtime_info *to_airtime_info(struct ieee80211_txq *txq) +{ + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + + if (txq->sta) { + sta = container_of(txq->sta, struct sta_info, sta); + return &sta->airtime[txq->ac]; + } + + sdata = vif_to_sdata(txq->vif); + return &sdata->airtime[txq->ac]; +} + +/* To avoid divisions in the fast path, we keep pre-computed reciprocals for + * airtime weight calculations. There are two different weights to keep track + * of: The per-station weight and the sum of weights per phy. + * + * For the per-station weights (kept in airtime_info below), we use 32-bit + * reciprocals with a devisor of 2^19. This lets us keep the multiplications and + * divisions for the station weights as 32-bit operations at the cost of a bit + * of rounding error for high weights; but the choice of divisor keeps rounding + * errors <10% for weights <2^15, assuming no more than 8ms of airtime is + * reported at a time. + * + * For the per-phy sum of weights the values can get higher, so we use 64-bit + * operations for those with a 32-bit divisor, which should avoid any + * significant rounding errors. + */ +#define IEEE80211_RECIPROCAL_DIVISOR_64 0x100000000ULL +#define IEEE80211_RECIPROCAL_SHIFT_64 32 +#define IEEE80211_RECIPROCAL_DIVISOR_32 0x80000U +#define IEEE80211_RECIPROCAL_SHIFT_32 19 + +static inline void airtime_weight_set(struct airtime_info *air_info, u16 weight) +{ + if (air_info->weight == weight) + return; + + air_info->weight = weight; + if (weight) { + air_info->weight_reciprocal = + IEEE80211_RECIPROCAL_DIVISOR_32 / weight; + } else { + air_info->weight_reciprocal = 0; + } +} + +static inline void airtime_weight_sum_set(struct airtime_sched_info *air_sched, + int weight_sum) +{ + if (air_sched->weight_sum == weight_sum) + return; + + air_sched->weight_sum = weight_sum; + if (air_sched->weight_sum) { + air_sched->weight_sum_reciprocal = IEEE80211_RECIPROCAL_DIVISOR_64; + do_div(air_sched->weight_sum_reciprocal, air_sched->weight_sum); + } else { + air_sched->weight_sum_reciprocal = 0; + } +} + +/* A problem when trying to enforce airtime fairness is that we want to divide + * the airtime between the currently *active* stations. However, basing this on + * the instantaneous queue state of stations doesn't work, as queues tend to + * oscillate very quickly between empty and occupied, leading to the scheduler + * thinking only a single station is active when deciding whether to allow + * transmission (and thus not throttling correctly). + * + * To fix this we use a timer-based notion of activity: a station is considered + * active if it has been scheduled within the last 100 ms; we keep a separate + * list of all the stations considered active in this manner, and lazily update + * the total weight of active stations from this list (filtering the stations in + * the list by their 'last active' time). + * + * We add one additional safeguard to guard against stations that manage to get + * scheduled every 100 ms but don't transmit a lot of data, and thus don't use + * up any airtime. Such stations would be able to get priority for an extended + * period of time if they do start transmitting at full capacity again, and so + * we add an explicit maximum for how far behind a station is allowed to fall in + * the virtual airtime domain. This limit is set to a relatively high value of + * 20 ms because the main mechanism for catching up idle stations is the active + * state as described above; i.e., the hard limit should only be hit in + * pathological cases. + */ +#define AIRTIME_ACTIVE_DURATION (100 * NSEC_PER_MSEC) +#define AIRTIME_MAX_BEHIND 20000 /* 20 ms */ + +static inline bool airtime_is_active(struct airtime_info *air_info, u64 now) +{ + return air_info->last_scheduled >= now - AIRTIME_ACTIVE_DURATION; +} + +static inline void airtime_set_active(struct airtime_sched_info *air_sched, + struct airtime_info *air_info, u64 now) +{ + air_info->last_scheduled = now; + air_sched->last_schedule_activity = now; + list_move_tail(&air_info->list, &air_sched->active_list); +} + +static inline bool airtime_catchup_v_t(struct airtime_sched_info *air_sched, + u64 v_t, u64 now) +{ + air_sched->v_t = v_t; + return true; +} + +static inline void init_airtime_info(struct airtime_info *air_info, + struct airtime_sched_info *air_sched) +{ + atomic_set(&air_info->aql_tx_pending, 0); + air_info->aql_limit_low = air_sched->aql_txq_limit_low; + air_info->aql_limit_high = air_sched->aql_txq_limit_high; + airtime_weight_set(air_info, IEEE80211_DEFAULT_AIRTIME_WEIGHT); + INIT_LIST_HEAD(&air_info->list); +} + static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) { return ether_addr_equal(raddr, addr) || @@ -1738,6 +1900,9 @@ void ieee80211_csa_finalize_work(struct work_struct *work); int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params); +/* color change handling */ +void ieee80211_color_change_finalize_work(struct work_struct *work); + /* interface handling */ #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ NETIF_F_HW_CSUM | NETIF_F_SG | \ @@ -1809,6 +1974,14 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); +void ieee80211_resort_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); +void ieee80211_unschedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool purge); +void ieee80211_update_airtime_weight(struct ieee80211_local *local, + struct airtime_sched_info *air_sched, + u64 now, bool force); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, @@ -1879,7 +2052,6 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta); enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta); -void ieee80211_sta_set_rx_nss(struct sta_info *sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, @@ -1912,6 +2084,11 @@ ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, /* S1G */ void ieee80211_s1g_sta_rate_init(struct sta_info *sta); +bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb); +void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -2045,6 +2222,8 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); +void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time); @@ -2287,9 +2466,13 @@ void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); -void ieee80211_tdls_chsw_work(struct work_struct *wk); void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason); +void +ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); + + const char *ieee80211_get_reason_code_string(u16 reason_code); u16 ieee80211_encode_usf(int val); u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 137fa4c50e07..62c95597704b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -462,6 +462,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do sdata_unlock(sdata); cancel_work_sync(&sdata->csa_finalize_work); + cancel_work_sync(&sdata->color_change_finalize_work); cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); @@ -551,6 +552,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do */ ieee80211_free_keys(sdata, true); skb_queue_purge(&sdata->skb_queue); + skb_queue_purge(&sdata->status_queue); } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); @@ -983,6 +985,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) } skb_queue_head_init(&sdata->skb_queue); + skb_queue_head_init(&sdata->status_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); return 0; @@ -1318,13 +1321,158 @@ static void ieee80211_if_setup_no_queue(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; } +static void ieee80211_iface_process_skb(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_BACK) { + struct sta_info *sta; + int len = skb->len; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + if (sta) { + switch (mgmt->u.action.u.addba_req.action_code) { + case WLAN_ACTION_ADDBA_REQ: + ieee80211_process_addba_request(local, sta, + mgmt, len); + break; + case WLAN_ACTION_ADDBA_RESP: + ieee80211_process_addba_resp(local, sta, + mgmt, len); + break; + case WLAN_ACTION_DELBA: + ieee80211_process_delba(sdata, sta, + mgmt, len); + break; + default: + WARN_ON(1); + break; + } + } + mutex_unlock(&local->sta_mtx); + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_VHT) { + switch (mgmt->u.action.u.vht_group_notif.action_code) { + case WLAN_VHT_ACTION_OPMODE_NOTIF: { + struct ieee80211_rx_status *status; + enum nl80211_band band; + struct sta_info *sta; + u8 opmode; + + status = IEEE80211_SKB_RXCB(skb); + band = status->band; + opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + + if (sta) + ieee80211_vht_handle_opmode(sdata, sta, opmode, + band); + + mutex_unlock(&local->sta_mtx); + break; + } + case WLAN_VHT_ACTION_GROUPID_MGMT: + ieee80211_process_mu_groups(sdata, mgmt); + break; + default: + WARN_ON(1); + break; + } + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_S1G) { + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_TEARDOWN: + case WLAN_S1G_TWT_SETUP: + ieee80211_s1g_rx_twt_action(sdata, skb); + break; + default: + break; + } + } else if (ieee80211_is_ext(mgmt->frame_control)) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) + ieee80211_sta_rx_queued_ext(sdata, skb); + else + WARN_ON(1); + } else if (ieee80211_is_data_qos(mgmt->frame_control)) { + struct ieee80211_hdr *hdr = (void *)mgmt; + struct sta_info *sta; + + /* + * So the frame isn't mgmt, but frame_control + * is at the right place anyway, of course, so + * the if statement is correct. + * + * Warn if we have other data frame types here, + * they must not get here. + */ + WARN_ON(hdr->frame_control & + cpu_to_le16(IEEE80211_STYPE_NULLFUNC)); + WARN_ON(!(hdr->seq_ctrl & + cpu_to_le16(IEEE80211_SCTL_FRAG))); + /* + * This was a fragment of a frame, received while + * a block-ack session was active. That cannot be + * right, so terminate the session. + */ + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + if (sta) { + u16 tid = ieee80211_get_tid(hdr); + + __ieee80211_stop_rx_ba_session( + sta, tid, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP, + true); + } + mutex_unlock(&local->sta_mtx); + } else switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ieee80211_sta_rx_queued_mgmt(sdata, skb); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_rx_queued_mgmt(sdata, skb); + break; + case NL80211_IFTYPE_MESH_POINT: + if (!ieee80211_vif_is_mesh(&sdata->vif)) + break; + ieee80211_mesh_rx_queued_mgmt(sdata, skb); + break; + default: + WARN(1, "frame for unexpected interface type"); + break; + } +} + +static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_S1G) { + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_TEARDOWN: + case WLAN_S1G_TWT_SETUP: + ieee80211_s1g_status_twt_action(sdata, skb); + break; + default: + break; + } + } +} + static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, work); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; - struct sta_info *sta; if (!ieee80211_sdata_running(sdata)) return; @@ -1337,118 +1485,24 @@ static void ieee80211_iface_work(struct work_struct *work) /* first process frames */ while ((skb = skb_dequeue(&sdata->skb_queue))) { - struct ieee80211_mgmt *mgmt = (void *)skb->data; - kcov_remote_start_common(skb_get_kcov_handle(skb)); - if (ieee80211_is_action(mgmt->frame_control) && - mgmt->u.action.category == WLAN_CATEGORY_BACK) { - int len = skb->len; - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, mgmt->sa); - if (sta) { - switch (mgmt->u.action.u.addba_req.action_code) { - case WLAN_ACTION_ADDBA_REQ: - ieee80211_process_addba_request( - local, sta, mgmt, len); - break; - case WLAN_ACTION_ADDBA_RESP: - ieee80211_process_addba_resp(local, sta, - mgmt, len); - break; - case WLAN_ACTION_DELBA: - ieee80211_process_delba(sdata, sta, - mgmt, len); - break; - default: - WARN_ON(1); - break; - } - } - mutex_unlock(&local->sta_mtx); - } else if (ieee80211_is_action(mgmt->frame_control) && - mgmt->u.action.category == WLAN_CATEGORY_VHT) { - switch (mgmt->u.action.u.vht_group_notif.action_code) { - case WLAN_VHT_ACTION_OPMODE_NOTIF: { - struct ieee80211_rx_status *status; - enum nl80211_band band; - u8 opmode; - - status = IEEE80211_SKB_RXCB(skb); - band = status->band; - opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; - - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, mgmt->sa); - - if (sta) - ieee80211_vht_handle_opmode(sdata, sta, - opmode, - band); - - mutex_unlock(&local->sta_mtx); - break; - } - case WLAN_VHT_ACTION_GROUPID_MGMT: - ieee80211_process_mu_groups(sdata, mgmt); - break; - default: - WARN_ON(1); - break; - } - } else if (ieee80211_is_ext(mgmt->frame_control)) { - if (sdata->vif.type == NL80211_IFTYPE_STATION) - ieee80211_sta_rx_queued_ext(sdata, skb); - else - WARN_ON(1); - } else if (ieee80211_is_data_qos(mgmt->frame_control)) { - struct ieee80211_hdr *hdr = (void *)mgmt; - /* - * So the frame isn't mgmt, but frame_control - * is at the right place anyway, of course, so - * the if statement is correct. - * - * Warn if we have other data frame types here, - * they must not get here. - */ - WARN_ON(hdr->frame_control & - cpu_to_le16(IEEE80211_STYPE_NULLFUNC)); - WARN_ON(!(hdr->seq_ctrl & - cpu_to_le16(IEEE80211_SCTL_FRAG))); - /* - * This was a fragment of a frame, received while - * a block-ack session was active. That cannot be - * right, so terminate the session. - */ - mutex_lock(&local->sta_mtx); - sta = sta_info_get_bss(sdata, mgmt->sa); - if (sta) { - u16 tid = ieee80211_get_tid(hdr); + if (skb->protocol == cpu_to_be16(ETH_P_TDLS)) + ieee80211_process_tdls_channel_switch(sdata, skb); + else + ieee80211_iface_process_skb(local, sdata, skb); - __ieee80211_stop_rx_ba_session( - sta, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP, - true); - } - mutex_unlock(&local->sta_mtx); - } else switch (sdata->vif.type) { - case NL80211_IFTYPE_STATION: - ieee80211_sta_rx_queued_mgmt(sdata, skb); - break; - case NL80211_IFTYPE_ADHOC: - ieee80211_ibss_rx_queued_mgmt(sdata, skb); - break; - case NL80211_IFTYPE_MESH_POINT: - if (!ieee80211_vif_is_mesh(&sdata->vif)) - break; - ieee80211_mesh_rx_queued_mgmt(sdata, skb); - break; - default: - WARN(1, "frame for unexpected interface type"); - break; - } + kfree_skb(skb); + kcov_remote_stop(); + } + /* process status queue */ + while ((skb = skb_dequeue(&sdata->status_queue))) { + kcov_remote_start_common(skb_get_kcov_handle(skb)); + + ieee80211_iface_process_status(sdata, skb); kfree_skb(skb); + kcov_remote_stop(); } @@ -1515,9 +1569,11 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, } skb_queue_head_init(&sdata->skb_queue); + skb_queue_head_init(&sdata->status_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work); INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work); + INIT_WORK(&sdata->color_change_finalize_work, ieee80211_color_change_finalize_work); INIT_LIST_HEAD(&sdata->assigned_chanctx_list); INIT_LIST_HEAD(&sdata->reserved_chanctx_list); @@ -1964,6 +2020,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, } } + for (i = 0; i < IEEE80211_NUM_ACS; i++) + init_airtime_info(&sdata->airtime[i], &local->airtime[i]); + ieee80211_set_default_queues(sdata); sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -1985,9 +2044,16 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); - /* MTU range: 256 - 2304 */ + /* MTU range is normally 256 - 2304, where the upper limit is + * the maximum MSDU size. Monitor interfaces send and receive + * MPDU and A-MSDU frames which may be much larger so we do + * not impose an upper limit in that case. + */ ndev->min_mtu = 256; - ndev->max_mtu = local->hw.max_mtu; + if (type == NL80211_IFTYPE_MONITOR) + ndev->max_mtu = 0; + else + ndev->max_mtu = local->hw.max_mtu; ret = cfg80211_register_netdevice(ndev); if (ret) { diff --git a/net/mac80211/led.c b/net/mac80211/led.c index b275c8853074..6de8d0ad5497 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -259,7 +259,6 @@ static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; - struct led_classdev *led_cdev; unsigned long on, off, tpt; int i; @@ -283,10 +282,7 @@ static void tpt_trig_timer(struct timer_list *t) } } - read_lock(&local->tpt_led.leddev_list_lock); - list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) - led_blink_set(led_cdev, &on, &off); - read_unlock(&local->tpt_led.leddev_list_lock); + led_trigger_blink(&local->tpt_led, &on, &off); } const char * @@ -341,7 +337,6 @@ static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; - struct led_classdev *led_cdev; if (!tpt_trig->running) return; @@ -349,10 +344,7 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); - read_lock(&local->tpt_led.leddev_list_lock); - list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) - led_set_brightness(led_cdev, LED_OFF); - read_unlock(&local->tpt_led.leddev_list_lock); + led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index f33a3acd7f96..45fb517591ee 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -257,14 +257,15 @@ static void ieee80211_restart_work(struct work_struct *work) /* wait for scan work complete */ flush_workqueue(local->workqueue); flush_work(&local->sched_scan_stopped_work); + flush_work(&local->radar_detected_work); + + rtnl_lock(); + /* we might do interface manipulations, so need both */ + wiphy_lock(local->hw.wiphy); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), "%s called with hardware scan in progress\n", __func__); - flush_work(&local->radar_detected_work); - /* we might do interface manipulations, so need both */ - rtnl_lock(); - wiphy_lock(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { /* * XXX: there may be more work for other vif types and even @@ -706,10 +707,13 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, spin_lock_init(&local->queue_stop_reason_lock); for (i = 0; i < IEEE80211_NUM_ACS; i++) { - INIT_LIST_HEAD(&local->active_txqs[i]); - spin_lock_init(&local->active_txq_lock[i]); - local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; - local->aql_txq_limit_high[i] = + struct airtime_sched_info *air_sched = &local->airtime[i]; + + air_sched->active_txqs = RB_ROOT_CACHED; + INIT_LIST_HEAD(&air_sched->active_list); + spin_lock_init(&air_sched->lock); + air_sched->aql_txq_limit_low = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L; + air_sched->aql_txq_limit_high = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H; } @@ -739,8 +743,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, INIT_WORK(&local->sched_scan_stopped_work, ieee80211_sched_scan_stopped_work); - INIT_WORK(&local->tdls_chsw_work, ieee80211_tdls_chsw_work); - spin_lock_init(&local->ack_status_lock); idr_init(&local->ack_status_frames); @@ -757,7 +759,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); - skb_queue_head_init(&local->skb_queue_tdls_chsw); ieee80211_alloc_led_names(local); @@ -1014,8 +1015,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; - if (!supp_he) - supp_he = !!ieee80211_get_he_sta_cap(sband); + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *iftd; + + iftd = &sband->iftype_data[i]; + + supp_he = supp_he || iftd->he_cap.has_he; + } /* HT, VHT, HE require QoS, thus >= 4 queues */ if (WARN_ON(local->hw.queues < IEEE80211_NUM_ACS && @@ -1389,7 +1395,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) cancel_delayed_work_sync(&local->roc_work); cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); - cancel_work_sync(&local->tdls_chsw_work); flush_work(&local->sched_scan_stopped_work); flush_work(&local->radar_detected_work); @@ -1401,7 +1406,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); - skb_queue_purge(&local->skb_queue_tdls_chsw); wiphy_unregister(local->hw.wiphy); destroy_workqueue(local->workqueue); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 40492d1bd8fd..77080b4f87b8 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -134,7 +134,7 @@ struct mesh_path { * gate's mpath may or may not be resolved and active. * @gates_lock: protects updates to known_gates * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr - * @walk_head: linked list containging all mesh_path objects + * @walk_head: linked list containing all mesh_path objects * @walk_lock: lock protecting walk_head * @entries: number of entries in the table */ diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 3db514c4c63a..a05b615deb51 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1124,7 +1124,7 @@ enddiscovery: * forwarding information is found. * * Returns: 0 if the next hop was found and -ENOENT if the frame was queued. - * skb is freeed here if no mpath could be allocated. + * skb is freed here if no mpath could be allocated. */ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 620ecf922408..efbefcbac3ac 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -122,7 +122,7 @@ static void prepare_for_gate(struct sk_buff *skb, char *dst_addr, hdr = (struct ieee80211_hdr *) skb->data; /* we preserve the previous mesh header and only add - * the new addreses */ + * the new addresses */ mshdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); mshdr->flags = MESH_FLAGS_AE_A5_A6; memcpy(mshdr->eaddr1, hdr->addr3, ETH_ALEN); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index aca26df7587d..a6915847d78a 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -150,7 +150,7 @@ out: * mesh STA in a MBSS. Three HT protection modes are supported for now, non-HT * mixed mode, 20MHz-protection and no-protection mode. non-HT mixed mode is * selected if any non-HT peers are present in our MBSS. 20MHz-protection mode - * is selected if all peers in our 20/40MHz MBSS support HT and atleast one + * is selected if all peers in our 20/40MHz MBSS support HT and at least one * HT20 peer is present. Otherwise no-protection mode is selected. */ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3f2aad2e7436..c0ea3b1aa9e1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2020 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #include <linux/delay.h> @@ -371,7 +371,6 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def chandef; u16 ht_opmode; u32 flags; - enum ieee80211_sta_rx_bandwidth new_sta_bw; u32 vht_cap_info = 0; int ret; @@ -385,7 +384,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, /* don't check HE if we associated as non-HE station */ if (ifmgd->flags & IEEE80211_STA_DISABLE_HE || - !ieee80211_get_he_sta_cap(sband)) + !ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif))) + he_oper = NULL; if (WARN_ON_ONCE(!sta)) @@ -445,40 +446,13 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, IEEE80211_STA_DISABLE_160MHZ)) || !cfg80211_chandef_valid(&chandef)) { sdata_info(sdata, - "AP %pM changed bandwidth in a way we can't support - disconnect\n", - ifmgd->bssid); - return -EINVAL; - } - - switch (chandef.width) { - case NL80211_CHAN_WIDTH_20_NOHT: - case NL80211_CHAN_WIDTH_20: - new_sta_bw = IEEE80211_STA_RX_BW_20; - break; - case NL80211_CHAN_WIDTH_40: - new_sta_bw = IEEE80211_STA_RX_BW_40; - break; - case NL80211_CHAN_WIDTH_80: - new_sta_bw = IEEE80211_STA_RX_BW_80; - break; - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: - new_sta_bw = IEEE80211_STA_RX_BW_160; - break; - default: + "AP %pM changed caps/bw in a way we can't support (0x%x/0x%x) - disconnect\n", + ifmgd->bssid, flags, ifmgd->flags); return -EINVAL; } - if (new_sta_bw > sta->cur_max_bandwidth) - new_sta_bw = sta->cur_max_bandwidth; - - if (new_sta_bw < sta->sta.bandwidth) { - sta->sta.bandwidth = new_sta_bw; - rate_control_rate_update(local, sband, sta, - IEEE80211_RC_BW_CHANGED); - } - ret = ieee80211_vif_change_bandwidth(sdata, &chandef, changed); + if (ret) { sdata_info(sdata, "AP %pM changed bandwidth to incompatible one - disconnect\n", @@ -486,12 +460,6 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return ret; } - if (new_sta_bw > sta->sta.bandwidth) { - sta->sta.bandwidth = new_sta_bw; - rate_control_rate_update(local, sband, sta, - IEEE80211_RC_BW_CHANGED); - } - return 0; } @@ -617,7 +585,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; /* - * If some other vif is using the MU-MIMO capablity we cannot associate + * If some other vif is using the MU-MIMO capability we cannot associate * using MU-MIMO - this will lead to contradictions in the group-id * mechanism. * Ownership is defined since association request, in order to avoid @@ -676,7 +644,8 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); - he_cap = ieee80211_get_he_sta_cap(sband); + he_cap = ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); if (!he_cap || !reg_cap) return; @@ -712,6 +681,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) u32 rates = 0; __le16 listen_int; struct element *ext_capa = NULL; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + const struct ieee80211_sband_iftype_data *iftd; + struct ieee80211_prep_tx_info info = {}; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) @@ -756,6 +728,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) } } + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + skb = alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + /* bit too much but doesn't matter */ 2 + assoc_data->ssid_len + /* SSID */ @@ -770,7 +744,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + - 9, /* WMM */ + 9 + /* WMM */ + (iftd ? iftd->vendor_elems.len : 0), GFP_KERNEL); if (!skb) return; @@ -810,12 +785,14 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) mgmt->u.reassoc_req.listen_interval = listen_int; memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_bssid, ETH_ALEN); + info.subtype = IEEE80211_STYPE_REASSOC_REQ; } else { skb_put(skb, 4); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ); mgmt->u.assoc_req.capab_info = cpu_to_le16(capab); mgmt->u.assoc_req.listen_interval = listen_int; + info.subtype = IEEE80211_STYPE_ASSOC_REQ; } /* SSID */ @@ -1043,6 +1020,9 @@ skip_rates: ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); } + if (iftd && iftd->vendor_elems.data && iftd->vendor_elems.len) + skb_put_data(skb, iftd->vendor_elems.data, iftd->vendor_elems.len); + /* add any remaining custom (i.e. vendor specific here) IEs */ if (assoc_data->ie_len) { noffset = assoc_data->ie_len; @@ -1060,7 +1040,7 @@ skip_rates: ifmgd->assoc_req_ies = kmemdup(ie_start, pos - ie_start, GFP_ATOMIC); ifmgd->assoc_req_ies_len = pos - ie_start; - drv_mgd_prepare_tx(local, sdata, 0); + drv_mgd_prepare_tx(local, sdata, &info); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -1094,11 +1074,6 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - /* Don't send NDPs when STA is connected HE */ - if (sdata->vif.type == NL80211_IFTYPE_STATION && - !(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) - return; - skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) @@ -1120,8 +1095,8 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, ieee80211_tx_skb(sdata, skb); } -static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) +void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { struct sk_buff *skb; struct ieee80211_hdr *nullfunc; @@ -1130,10 +1105,6 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; - /* Don't send NDPs when connected HE */ - if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) - return; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; @@ -1183,10 +1154,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) */ if (sdata->reserved_chanctx) { - struct ieee80211_supported_band *sband = NULL; - struct sta_info *mgd_sta = NULL; - enum ieee80211_sta_rx_bandwidth bw = IEEE80211_STA_RX_BW_20; - /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their @@ -1195,48 +1162,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) if (sdata->reserved_ready) goto out; - if (sdata->vif.bss_conf.chandef.width != - sdata->csa_chandef.width) { - /* - * For managed interface, we need to also update the AP - * station bandwidth and align the rate scale algorithm - * on the bandwidth change. Here we only consider the - * bandwidth of the new channel definition (as channel - * switch flow does not have the full HT/VHT/HE - * information), assuming that if additional changes are - * required they would be done as part of the processing - * of the next beacon from the AP. - */ - switch (sdata->csa_chandef.width) { - case NL80211_CHAN_WIDTH_20_NOHT: - case NL80211_CHAN_WIDTH_20: - default: - bw = IEEE80211_STA_RX_BW_20; - break; - case NL80211_CHAN_WIDTH_40: - bw = IEEE80211_STA_RX_BW_40; - break; - case NL80211_CHAN_WIDTH_80: - bw = IEEE80211_STA_RX_BW_80; - break; - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: - bw = IEEE80211_STA_RX_BW_160; - break; - } - - mgd_sta = sta_info_get(sdata, ifmgd->bssid); - sband = - local->hw.wiphy->bands[sdata->csa_chandef.chan->band]; - } - - if (sdata->vif.bss_conf.chandef.width > - sdata->csa_chandef.width) { - mgd_sta->sta.bandwidth = bw; - rate_control_rate_update(local, sband, mgd_sta, - IEEE80211_RC_BW_CHANGED); - } - ret = ieee80211_vif_use_reserved_context(sdata); if (ret) { sdata_info(sdata, @@ -1247,13 +1172,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) goto out; } - if (sdata->vif.bss_conf.chandef.width < - sdata->csa_chandef.width) { - mgd_sta->sta.bandwidth = bw; - rate_control_rate_update(local, sband, mgd_sta, - IEEE80211_RC_BW_CHANGED); - } - goto out; } @@ -2341,6 +2259,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; u32 changed = 0; + struct ieee80211_prep_tx_info info = { + .subtype = stype, + }; sdata_assert_lock(sdata); @@ -2390,8 +2311,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, * driver requested so. */ if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) && - !ifmgd->have_beacon) - drv_mgd_prepare_tx(sdata->local, sdata, 0); + !ifmgd->have_beacon) { + drv_mgd_prepare_tx(sdata->local, sdata, &info); + } ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, ifmgd->bssid, stype, reason, @@ -2402,6 +2324,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, if (tx) ieee80211_flush_queues(local, sdata, false); + drv_mgd_complete_tx(sdata->local, sdata, &info); + /* clear bssid only after building the needed mgmt frames */ eth_zero_addr(ifmgd->bssid); @@ -2617,10 +2541,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) - ifmgd->probe_send_count--; - else - ieee80211_send_nullfunc(sdata->local, sdata, false); + ieee80211_send_nullfunc(sdata->local, sdata, false); } else { int ssid_len; @@ -2952,6 +2873,9 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, u8 *pos; struct ieee802_11_elems elems; u32 tx_flags = 0; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_AUTH, + }; pos = mgmt->u.auth.variable; ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, @@ -2959,7 +2883,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, if (!elems.challenge) return; auth_data->expected_transaction = 4; - drv_mgd_prepare_tx(sdata->local, sdata, 0); + drv_mgd_prepare_tx(sdata->local, sdata, &info); if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -3012,6 +2936,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, }; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_AUTH, + }; sdata_assert_lock(sdata); @@ -3040,7 +2967,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, auth_transaction, ifmgd->auth_data->expected_transaction); - return; + goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { @@ -3051,7 +2978,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || status_code == WLAN_STATUS_SAE_PK)))) - return; + goto notify_driver; sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); @@ -3059,7 +2986,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); - return; + goto notify_driver; } switch (ifmgd->auth_data->algorithm) { @@ -3081,10 +3008,11 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, default: WARN_ONCE(1, "invalid auth alg %d", ifmgd->auth_data->algorithm); - return; + goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; + info.success = 1; drv_event_callback(sdata->local, sdata, &event); if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && @@ -3098,6 +3026,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); +notify_driver: + drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ @@ -3314,6 +3244,23 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, return 0; } +static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss_conf *bss_conf, + struct ieee80211_supported_band *sband, + struct sta_info *sta) +{ + const struct ieee80211_sta_he_cap *own_he_cap = + ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); + + return bss_conf->he_support && + (sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_BCAST_TWT) && + own_he_cap && + (own_he_cap->he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_BCAST_TWT); +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len, @@ -3529,6 +3476,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->twt_protected = false; } + bss_conf->twt_broadcast = + ieee80211_twt_bcast_support(sdata, bss_conf, sband, sta); + if (bss_conf->he_support) { bss_conf->he_bss_color.color = le32_get_bits(elems->he_operation->he_oper_params, @@ -3699,6 +3649,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, }; + struct ieee80211_prep_tx_info info = {}; sdata_assert_lock(sdata); @@ -3728,6 +3679,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, aid = 0; /* TODO */ } + /* + * Note: this may not be perfect, AP might misbehave - if + * anyone needs to rely on perfect complete notification + * with the exact right subtype, then we need to track what + * we actually transmitted. + */ + info.subtype = reassoc ? IEEE80211_STYPE_REASSOC_REQ : + IEEE80211_STYPE_ASSOC_REQ; + sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", reassoc ? "Rea" : "A", mgmt->sa, @@ -3753,7 +3713,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(sdata, assoc_data->timeout); - return; + goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { @@ -3768,7 +3728,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, cbss); - return; + goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; drv_event_callback(sdata->local, sdata, &event); @@ -3786,10 +3746,14 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (sdata->tx_conf[ac].uapsd) uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; + + info.success = 1; } cfg80211_rx_assoc_resp(sdata->dev, cbss, (u8 *)mgmt, len, uapsd_queues, ifmgd->assoc_req_ies, ifmgd->assoc_req_ies_len); +notify_driver: + drv_mgd_complete_tx(sdata->local, sdata, &info); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, @@ -4408,7 +4372,9 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) u32 tx_flags = 0; u16 trans = 1; u16 status = 0; - u16 prepare_tx_duration = 0; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_AUTH, + }; sdata_assert_lock(sdata); @@ -4431,10 +4397,9 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) } if (auth_data->algorithm == WLAN_AUTH_SAE) - prepare_tx_duration = - jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); + info.duration = jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); - drv_mgd_prepare_tx(local, sdata, prepare_tx_duration); + drv_mgd_prepare_tx(local, sdata, &info); sdata_info(sdata, "send auth to %pM (try %d/%d)\n", auth_data->bss->bssid, auth_data->tries, @@ -4929,11 +4894,13 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, } static bool -ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, +ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, const struct ieee80211_he_operation *he_op) { const struct ieee80211_sta_he_cap *sta_he_cap = - ieee80211_get_he_sta_cap(sband); + ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); u16 ap_min_req_set; int i; @@ -5027,7 +4994,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - if (!ieee80211_get_he_sta_cap(sband)) + if (!ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif))) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; rcu_read_lock(); @@ -5085,7 +5053,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, else he_oper = NULL; - if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + if (!ieee80211_verify_sta_he_mcs_support(sdata, sband, he_oper)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } @@ -5655,15 +5623,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, 2 * FILS_NONCE_LEN); assoc_data->bss = req->bss; - - if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { - if (ifmgd->powersave) - sdata->smps_mode = IEEE80211_SMPS_DYNAMIC; - else - sdata->smps_mode = IEEE80211_SMPS_OFF; - } else - sdata->smps_mode = ifmgd->req_smps; - assoc_data->capability = req->bss->capability; assoc_data->supp_rates = bss->supp_rates; assoc_data->supp_rates_len = bss->supp_rates_len; @@ -5770,6 +5729,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (err) goto err_clear; + if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + sdata->smps_mode = IEEE80211_SMPS_DYNAMIC; + else + sdata->smps_mode = IEEE80211_SMPS_OFF; + } else { + sdata->smps_mode = ifmgd->req_smps; + } + rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); @@ -5854,6 +5822,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; bool tx = !req->local_state_change; + struct ieee80211_prep_tx_info info = { + .subtype = IEEE80211_STYPE_DEAUTH, + }; if (ifmgd->auth_data && ether_addr_equal(ifmgd->auth_data->bss->bssid, req->bssid)) { @@ -5862,7 +5833,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); - drv_mgd_prepare_tx(sdata->local, sdata, 0); + drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, @@ -5871,7 +5842,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); - + drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } @@ -5882,7 +5853,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); - drv_mgd_prepare_tx(sdata->local, sdata, 0); + drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, @@ -5906,6 +5877,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); + drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 63652c39c8e0..e5935e3d7a07 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -297,15 +297,11 @@ void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata) static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc) { struct sk_buff *skb = txrc->skb; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - __le16 fc; - - fc = hdr->frame_control; return (info->flags & (IEEE80211_TX_CTL_NO_ACK | IEEE80211_TX_CTL_USE_MINRATE)) || - !ieee80211_is_data(fc); + !ieee80211_is_tx_data(skb); } static void rc_send_low_basicrate(struct ieee80211_tx_rate *rate, @@ -396,6 +392,10 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (ieee80211_is_tx_data(txrc->skb) && + info->flags & IEEE80211_TX_CTL_NO_ACK) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -870,7 +870,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, int max_rates) { struct ieee80211_sub_if_data *sdata; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_supported_band *sband; @@ -882,7 +881,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, sdata = vif_to_sdata(vif); sband = sdata->local->hw.wiphy->bands[info->band]; - if (ieee80211_is_data(hdr->frame_control)) + if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (dest[0].idx < 0) diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index a6f3fb4a9197..72b44d4c42d0 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -434,7 +434,7 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, unsigned int nsecs = 0, overhead = mi->overhead; unsigned int ampdu_len = 1; - /* do not account throughput if sucess prob is below 10% */ + /* do not account throughput if success prob is below 10% */ if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; @@ -1176,29 +1176,6 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) } static void -minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - u16 tid; - - if (skb_get_queue_mapping(skb) == IEEE80211_AC_VO) - return; - - if (unlikely(!ieee80211_is_data_qos(hdr->frame_control))) - return; - - if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE))) - return; - - tid = ieee80211_get_tid(hdr); - if (likely(sta->ampdu_mlme.tid_tx[tid])) - return; - - ieee80211_start_tx_ba_session(pubsta, tid, 0); -} - -static void minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st) { @@ -1211,6 +1188,10 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, bool last, update = false; int i; + /* Ignore packet that was sent with noAck flag */ + if (info->flags & IEEE80211_TX_CTL_NO_ACK) + return; + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) @@ -1498,10 +1479,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct minstrel_priv *mp = priv; u16 sample_idx; - if (!(info->flags & IEEE80211_TX_CTL_AMPDU) && - !minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_prob_rate))) - minstrel_aggr_check(sta, txrc->skb); - info->flags |= mi->tx_flags; #ifdef CONFIG_MAC80211_DEBUGFS @@ -1907,6 +1884,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", + .capa = RATE_CTRL_CAPA_AMPDU_TRIGGER, .tx_status_ext = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, .rate_init = minstrel_ht_rate_init, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index af0ef456eb0f..99ed68f7dc36 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -214,6 +214,24 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, return len; } +static void __ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + if (sta) + sta->rx_stats.packets++; +} + +static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + skb->protocol = 0; + __ieee80211_queue_skb_to_iface(sdata, sta, skb); +} + static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int rtap_space) @@ -254,8 +272,7 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, if (!skb) return; - skb_queue_tail(&sdata->skb_queue, skb); - ieee80211_queue_work(&sdata->local->hw, &sdata->work); + ieee80211_queue_skb_to_iface(sdata, NULL, skb); } /* @@ -342,7 +359,12 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, put_unaligned_le32(it_present_val, it_present); - pos = (void *)(it_present + 1); + /* This references through an offset into it_optional[] rather + * than via it_present otherwise later uses of pos will cause + * the compiler to think we have walked past the end of the + * struct member. + */ + pos = (void *)&rthdr->it_optional[it_present - rthdr->it_optional]; /* the order of the following fields is important */ @@ -355,7 +377,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, ieee80211_calculate_rx_timestamp(local, status, mpdulen, 0), pos); - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_TSFT)); pos += 8; } @@ -379,7 +401,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos = 0; } else { int shift = 0; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); if (status->bw == RATE_INFO_BW_10) shift = 1; else if (status->bw == RATE_INFO_BW_5) @@ -416,7 +438,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { *pos = status->signal; rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL)); pos++; } @@ -442,7 +464,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, if (status->encoding == RX_ENC_HT) { unsigned int stbc; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); *pos++ = local->hw.radiotap_mcs_details; *pos = 0; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) @@ -466,7 +488,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, while ((pos - (u8 *)rthdr) & 3) pos++; rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_AMPDU_STATUS)); put_unaligned_le32(status->ampdu_reference, pos); pos += 4; if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN) @@ -493,7 +515,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, if (status->encoding == RX_ENC_VHT) { u16 known = local->hw.radiotap_vht_details; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); put_unaligned_le16(known, pos); pos += 2; /* flags */ @@ -537,7 +559,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_TIMESTAMP)); /* ensure 8 byte alignment */ while ((pos - (u8 *)rthdr) & 7) @@ -625,7 +647,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* ensure 2 byte alignment */ while ((pos - (u8 *)rthdr) & 1) pos++; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); memcpy(pos, &he, sizeof(he)); pos += sizeof(he); } @@ -635,14 +657,14 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* ensure 2 byte alignment */ while ((pos - (u8 *)rthdr) & 1) pos++; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE_MU)); memcpy(pos, &he_mu, sizeof(he_mu)); pos += sizeof(he_mu); } if (status->flag & RX_FLAG_NO_PSDU) { rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_ZERO_LEN_PSDU); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_ZERO_LEN_PSDU)); *pos++ = status->zero_length_psdu_type; } @@ -650,7 +672,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* ensure 2 byte alignment */ while ((pos - (u8 *)rthdr) & 1) pos++; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_LSIG); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_LSIG)); memcpy(pos, &lsig, sizeof(lsig)); pos += sizeof(lsig); } @@ -713,7 +735,8 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local, * Need to make a copy and possibly remove radiotap header * and FCS from the original. */ - skb = skb_copy_expand(*origskb, needed_headroom, 0, GFP_ATOMIC); + skb = skb_copy_expand(*origskb, needed_headroom + NET_SKB_PAD, + 0, GFP_ATOMIC); if (!skb) return NULL; @@ -1339,7 +1362,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { struct sk_buff *skb = rx->skb; - struct ieee80211_local *local = rx->local; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct sta_info *sta = rx->sta; struct tid_ampdu_rx *tid_agg_rx; @@ -1391,8 +1413,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, /* if this mpdu is fragmented - terminate rx aggregation session */ sc = le16_to_cpu(hdr->seq_ctrl); if (sc & IEEE80211_SCTL_FRAG) { - skb_queue_tail(&rx->sdata->skb_queue, skb); - ieee80211_queue_work(&local->hw, &rx->sdata->work); + ieee80211_queue_skb_to_iface(rx->sdata, NULL, skb); return; } @@ -1563,12 +1584,8 @@ static void sta_ps_start(struct sta_info *sta) for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { struct ieee80211_txq *txq = sta->sta.txq[tid]; - struct txq_info *txqi = to_txq_info(txq); - spin_lock(&local->active_txq_lock[txq->ac]); - if (!list_empty(&txqi->schedule_order)) - list_del_init(&txqi->schedule_order); - spin_unlock(&local->active_txq_lock[txq->ac]); + ieee80211_unschedule_txq(&local->hw, txq, false); if (txq_has_queue(txq)) set_bit(tid, &sta->txq_buffered_tids); @@ -3009,11 +3026,8 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) tf->category == WLAN_CATEGORY_TDLS && (tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST || tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) { - skb_queue_tail(&local->skb_queue_tdls_chsw, rx->skb); - schedule_work(&local->tdls_chsw_work); - if (rx->sta) - rx->sta->rx_stats.packets++; - + rx->skb->protocol = cpu_to_be16(ETH_P_TDLS); + __ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } } @@ -3198,6 +3212,68 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static bool +ieee80211_process_rx_twt_action(struct ieee80211_rx_data *rx) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + struct ieee80211_sub_if_data *sdata = rx->sdata; + const struct ieee80211_sta_he_cap *hecap; + struct ieee80211_supported_band *sband; + + /* TWT actions are only supported in AP for the moment */ + if (sdata->vif.type != NL80211_IFTYPE_AP) + return false; + + if (!rx->local->ops->add_twt_setup) + return false; + + sband = rx->local->hw.wiphy->bands[status->band]; + hecap = ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); + if (!hecap) + return false; + + if (!(hecap->he_cap_elem.mac_cap_info[0] & + IEEE80211_HE_MAC_CAP0_TWT_RES)) + return false; + + if (!rx->sta) + return false; + + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: { + struct ieee80211_twt_setup *twt; + + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + + 1 + /* action code */ + sizeof(struct ieee80211_twt_setup) + + 2 /* TWT req_type agrt */) + break; + + twt = (void *)mgmt->u.action.u.s1g.variable; + if (twt->element_id != WLAN_EID_S1G_TWT) + break; + + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + + 4 + /* action code + token + tlv */ + twt->length) + break; + + return true; /* queue the frame */ + } + case WLAN_S1G_TWT_TEARDOWN: + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + 2) + break; + + return true; /* queue the frame */ + default: + break; + } + + return false; +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { @@ -3477,6 +3553,17 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) !mesh_path_sel_is_hwmp(sdata)) break; goto queue; + case WLAN_CATEGORY_S1G: + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: + case WLAN_S1G_TWT_TEARDOWN: + if (ieee80211_process_rx_twt_action(rx)) + goto queue; + break; + default: + break; + } + break; } return RX_CONTINUE; @@ -3493,10 +3580,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) return RX_QUEUED; queue: - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_stats.packets++; + ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } @@ -3644,10 +3728,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; /* for now only beacons are ext, so queue them */ - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&rx->local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_stats.packets++; + ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } @@ -3704,11 +3785,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; } - /* queue up frame and kick off work to process it */ - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&rx->local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_stats.packets++; + ieee80211_queue_skb_to_iface(sdata, rx->sta, rx->skb); return RX_QUEUED; } diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index c33f332b049a..7e35ab5b6166 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -6,6 +6,7 @@ #include <linux/ieee80211.h> #include <net/mac80211.h> #include "ieee80211_i.h" +#include "driver-ops.h" void ieee80211_s1g_sta_rate_init(struct sta_info *sta) { @@ -14,3 +15,182 @@ void ieee80211_s1g_sta_rate_init(struct sta_info *sta) sta->rx_stats.last_rate = STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_S1G); } + +bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + + if (likely(!ieee80211_is_action(mgmt->frame_control))) + return false; + + if (likely(mgmt->u.action.category != WLAN_CATEGORY_S1G)) + return false; + + return mgmt->u.action.u.s1g.action_code == WLAN_S1G_TWT_SETUP; +} + +static void +ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da, + const u8 *bssid, struct ieee80211_twt_setup *twt) +{ + int len = IEEE80211_MIN_ACTION_SIZE + 4 + twt->length; + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + len); + if (!skb) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_S1G; + mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_SETUP; + memcpy(mgmt->u.action.u.s1g.variable, twt, 3 + twt->length); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | + IEEE80211_TX_INTFL_MLME_CONN_TX | + IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); +} + +static void +ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata, + const u8 *da, const u8 *bssid, u8 flowid) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + u8 *id; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + IEEE80211_MIN_ACTION_SIZE + 2); + if (!skb) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE + 2); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_S1G; + mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_TEARDOWN; + id = (u8 *)mgmt->u.action.u.s1g.variable; + *id = flowid; + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | + IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); +} + +static void +ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_params *twt_agrt = (void *)twt->params; + + twt_agrt->req_type &= cpu_to_le16(~IEEE80211_TWT_REQTYPE_REQUEST); + + /* broadcast TWT not supported yet */ + if (twt->control & IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST) { + le16p_replace_bits(&twt_agrt->req_type, + TWT_SETUP_CMD_REJECT, + IEEE80211_TWT_REQTYPE_SETUP_CMD); + goto out; + } + + drv_add_twt_setup(sdata->local, sdata, &sta->sta, twt); +out: + ieee80211_s1g_send_twt_setup(sdata, mgmt->sa, sdata->vif.addr, twt); +} + +static void +ieee80211_s1g_rx_twt_teardown(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + + drv_twt_teardown_request(sdata->local, sdata, &sta->sta, + mgmt->u.action.u.s1g.variable[0]); +} + +static void +ieee80211_s1g_tx_twt_setup_fail(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_params *twt_agrt = (void *)twt->params; + u8 flowid = le16_get_bits(twt_agrt->req_type, + IEEE80211_TWT_REQTYPE_FLOWID); + + drv_twt_teardown_request(sdata->local, sdata, &sta->sta, flowid); + + ieee80211_s1g_send_twt_teardown(sdata, mgmt->sa, sdata->vif.addr, + flowid); +} + +void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + mutex_lock(&local->sta_mtx); + + sta = sta_info_get_bss(sdata, mgmt->sa); + if (!sta) + goto out; + + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: + ieee80211_s1g_rx_twt_setup(sdata, sta, skb); + break; + case WLAN_S1G_TWT_TEARDOWN: + ieee80211_s1g_rx_twt_teardown(sdata, sta, skb); + break; + default: + break; + } + +out: + mutex_unlock(&local->sta_mtx); +} + +void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + mutex_lock(&local->sta_mtx); + + sta = sta_info_get_bss(sdata, mgmt->da); + if (!sta) + goto out; + + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: + /* process failed twt setup frames */ + ieee80211_s1g_tx_twt_setup_fail(sdata, sta, skb); + break; + default: + break; + } + +out: + mutex_unlock(&local->sta_mtx); +} diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f2fb69da9b6e..2b5acb37587f 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -425,15 +425,11 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; - sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); - sta->airtime[i].deficit = sta->airtime_weight; - atomic_set(&sta->airtime[i].aql_tx_pending, 0); - sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; - sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; + init_airtime_info(&sta->airtime[i], &local->airtime[i]); } for (i = 0; i < IEEE80211_NUM_TIDS; i++) @@ -547,7 +543,7 @@ static int sta_info_insert_check(struct sta_info *sta) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || - is_multicast_ether_addr(sta->sta.addr))) + !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to @@ -1398,11 +1394,6 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; - /* Don't send NDPs when STA is connected HE */ - if (sdata->vif.type == NL80211_IFTYPE_STATION && - !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) - return; - if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | @@ -1897,24 +1888,59 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(ieee80211_sta_set_buffered); -void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, - u32 tx_airtime, u32 rx_airtime) +void ieee80211_register_airtime(struct ieee80211_txq *txq, + u32 tx_airtime, u32 rx_airtime) { - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - struct ieee80211_local *local = sta->sdata->local; - u8 ac = ieee80211_ac_from_tid(tid); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); + struct ieee80211_local *local = sdata->local; + u64 weight_sum, weight_sum_reciprocal; + struct airtime_sched_info *air_sched; + struct airtime_info *air_info; u32 airtime = 0; - if (sta->local->airtime_flags & AIRTIME_USE_TX) + air_sched = &local->airtime[txq->ac]; + air_info = to_airtime_info(txq); + + if (local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; - if (sta->local->airtime_flags & AIRTIME_USE_RX) + if (local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; - spin_lock_bh(&local->active_txq_lock[ac]); - sta->airtime[ac].tx_airtime += tx_airtime; - sta->airtime[ac].rx_airtime += rx_airtime; - sta->airtime[ac].deficit -= airtime; - spin_unlock_bh(&local->active_txq_lock[ac]); + /* Weights scale so the unit weight is 256 */ + airtime <<= 8; + + spin_lock_bh(&air_sched->lock); + + air_info->tx_airtime += tx_airtime; + air_info->rx_airtime += rx_airtime; + + if (air_sched->weight_sum) { + weight_sum = air_sched->weight_sum; + weight_sum_reciprocal = air_sched->weight_sum_reciprocal; + } else { + weight_sum = air_info->weight; + weight_sum_reciprocal = air_info->weight_reciprocal; + } + + /* Round the calculation of global vt */ + air_sched->v_t += (u64)((airtime + (weight_sum >> 1)) * + weight_sum_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_64; + air_info->v_t += (u32)((airtime + (air_info->weight >> 1)) * + air_info->weight_reciprocal) >> IEEE80211_RECIPROCAL_SHIFT_32; + ieee80211_resort_txq(&local->hw, txq); + + spin_unlock_bh(&air_sched->lock); +} + +void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, + u32 tx_airtime, u32 rx_airtime) +{ + struct ieee80211_txq *txq = pubsta->txq[tid]; + + if (!txq) + return; + + ieee80211_register_airtime(txq, tx_airtime, rx_airtime); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); @@ -2093,10 +2119,9 @@ static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; - struct ieee80211_local *local = sta->local; int cpu; - if (!ieee80211_hw_check(&local->hw, USES_RSS)) + if (!sta->pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { @@ -2196,9 +2221,7 @@ static void sta_set_tidstats(struct sta_info *sta, int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - if (!ieee80211_hw_check(&local->hw, USES_RSS)) - tidstats->rx_msdu += - sta_get_tidstats_msdu(&sta->rx_stats, tid); + tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->rx_stats, tid); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { @@ -2277,7 +2300,6 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->rx_beacon = sdata->u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); - sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | @@ -2312,8 +2334,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { - if (!ieee80211_hw_check(&local->hw, USES_RSS)) - sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); + sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { @@ -2363,7 +2384,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { - sinfo->airtime_weight = sta->airtime_weight; + sinfo->airtime_weight = sta->airtime[0].weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 0333072ebd98..ba2796782008 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -135,18 +135,25 @@ enum ieee80211_agg_stop_reason { #define AIRTIME_USE_TX BIT(0) #define AIRTIME_USE_RX BIT(1) + struct airtime_info { u64 rx_airtime; u64 tx_airtime; - s64 deficit; + u64 v_t; + u64 last_scheduled; + struct list_head list; atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ u32 aql_limit_low; u32 aql_limit_high; + u32 weight_reciprocal; + u16 weight; }; void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed); +void ieee80211_register_airtime(struct ieee80211_txq *txq, + u32 tx_airtime, u32 rx_airtime); struct sta_info; @@ -515,7 +522,6 @@ struct ieee80211_fragment_cache { * @tid_seq: per-TID sequence numbers for sending to this STA * @airtime: per-AC struct airtime_info describing airtime statistics for this * station - * @airtime_weight: station weight for airtime fairness calculation purposes * @ampdu_mlme: A-MPDU state machine state * @mesh: mesh STA information * @debugfs_dir: debug filesystem directory dentry @@ -646,7 +652,6 @@ struct sta_info { u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; struct airtime_info airtime[IEEE80211_NUM_ACS]; - u16 airtime_weight; /* * Aggregation information, locked with lock. diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 9baf185ee4c7..f6f63a0b1b72 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -305,8 +305,8 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = - cpu_to_le32((1 << IEEE80211_RADIOTAP_TX_FLAGS) | - (1 << IEEE80211_RADIOTAP_DATA_RETRIES)); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | + BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* @@ -331,7 +331,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, sband->bitrates[info->status.rates[0].idx].bitrate; if (legacy_rate) { - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift)); /* padding for tx flags */ pos += 2; @@ -358,7 +358,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_MCS)) { - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; @@ -374,7 +374,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); @@ -419,7 +419,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); @@ -495,7 +495,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; @@ -512,7 +512,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); @@ -705,13 +705,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb, hdr_size) == - WLAN_TDLS_TEARDOWN)) + WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); - else + } else if (ieee80211_s1g_is_twt_setup(skb)) { + if (!acked) { + struct sk_buff *qskb; + + qskb = skb_clone(skb, GFP_ATOMIC); + if (qskb) { + skb_queue_tail(&sdata->status_queue, + qskb); + ieee80211_queue_work(&local->hw, + &sdata->work); + } + } + } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); + } } rcu_read_unlock(); @@ -970,6 +983,25 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); + } else if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { + struct ieee80211_sub_if_data *sdata; + struct ieee80211_txq *txq; + u32 airtime; + + /* Account airtime to multicast queue */ + sdata = ieee80211_sdata_from_skb(local, skb); + + if (sdata && (txq = sdata->vif.txq)) { + airtime = info->status.tx_time ?: + ieee80211_calc_expected_tx_airtime(hw, + &sdata->vif, + NULL, + skb->len, + false); + + ieee80211_register_airtime(txq, airtime, 0); + } } /* SNMP counters @@ -1006,12 +1038,11 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { - if (info->flags & IEEE80211_TX_STAT_ACK) { + if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; - } else - mod_timer(&local->dynamic_ps_timer, jiffies + - msecs_to_jiffies(10)); + mod_timer(&local->dynamic_ps_timer, + jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f91d02b81b92..45e532ad1215 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1920,7 +1920,7 @@ out: return ret; } -static void +void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { @@ -1971,32 +1971,6 @@ void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) rcu_read_unlock(); } -void ieee80211_tdls_chsw_work(struct work_struct *wk) -{ - struct ieee80211_local *local = - container_of(wk, struct ieee80211_local, tdls_chsw_work); - struct ieee80211_sub_if_data *sdata; - struct sk_buff *skb; - struct ieee80211_tdls_data *tf; - - wiphy_lock(local->hw.wiphy); - while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) { - tf = (struct ieee80211_tdls_data *)skb->data; - list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata) || - sdata->vif.type != NL80211_IFTYPE_STATION || - !ether_addr_equal(tf->da, sdata->vif.addr)) - continue; - - ieee80211_process_tdls_channel_switch(sdata, skb); - break; - } - - kfree_skb(skb); - } - wiphy_unlock(local->hw.wiphy); -} - void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 8fcc39056402..9e8381bef7ed 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH -* Copyright (C) 2018 - 2020 Intel Corporation +* Copyright (C) 2018 - 2021 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -1461,31 +1461,52 @@ DEFINE_EVENT(release_evt, drv_allow_buffered_frames, TP_ARGS(local, sta, tids, num_frames, reason, more_data) ); -TRACE_EVENT(drv_mgd_prepare_tx, +DECLARE_EVENT_CLASS(mgd_prepare_complete_tx_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - u16 duration), + u16 duration, u16 subtype, bool success), - TP_ARGS(local, sdata, duration), + TP_ARGS(local, sdata, duration, subtype, success), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, duration) + __field(u16, subtype) + __field(u8, success) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->duration = duration; + __entry->subtype = subtype; + __entry->success = success; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " duration: %u", - LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration + LOCAL_PR_FMT VIF_PR_FMT " duration: %u, subtype:0x%x, success:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration, + __entry->subtype, __entry->success ) ); +DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_prepare_tx, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u16 duration, u16 subtype, bool success), + + TP_ARGS(local, sdata, duration, subtype, success) +); + +DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_complete_tx, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u16 duration, u16 subtype, bool success), + + TP_ARGS(local, sdata, duration, subtype, success) +); + DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), @@ -2804,6 +2825,73 @@ DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload, TP_ARGS(local, sdata, sta, enabled) ); +TRACE_EVENT(drv_add_twt_setup, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, + struct ieee80211_twt_setup *twt, + struct ieee80211_twt_params *twt_agrt), + + TP_ARGS(local, sta, twt, twt_agrt), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u8, dialog_token) + __field(u8, control) + __field(__le16, req_type) + __field(__le64, twt) + __field(u8, duration) + __field(__le16, mantissa) + __field(u8, channel) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->dialog_token = twt->dialog_token; + __entry->control = twt->control; + __entry->req_type = twt_agrt->req_type; + __entry->twt = twt_agrt->twt; + __entry->duration = twt_agrt->min_twt_dur; + __entry->mantissa = twt_agrt->mantissa; + __entry->channel = twt_agrt->channel; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT + " token:%d control:0x%02x req_type:0x%04x" + " twt:%llu duration:%d mantissa:%d channel:%d", + LOCAL_PR_ARG, STA_PR_ARG, __entry->dialog_token, + __entry->control, le16_to_cpu(__entry->req_type), + le64_to_cpu(__entry->twt), __entry->duration, + le16_to_cpu(__entry->mantissa), __entry->channel + ) +); + +TRACE_EVENT(drv_twt_teardown_request, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, u8 flowid), + + TP_ARGS(local, sta, flowid), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u8, flowid) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->flowid = flowid; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT " flowid:%d", + LOCAL_PR_ARG, STA_PR_ARG, __entry->flowid + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2651498d05e8..2d1193ed3eb5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -18,6 +18,7 @@ #include <linux/bitmap.h> #include <linux/rcupdate.h> #include <linux/export.h> +#include <linux/timekeeping.h> #include <net/net_namespace.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> @@ -666,6 +667,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) u32 len; struct ieee80211_tx_rate_control txrc; struct ieee80211_sta_rates *ratetbl = NULL; + bool encap = info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP; bool assoc = false; memset(&txrc, 0, sizeof(txrc)); @@ -707,7 +709,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) * just wants a probe response. */ if (tx->sdata->vif.bss_conf.use_short_preamble && - (ieee80211_is_data(hdr->frame_control) || + (ieee80211_is_tx_data(tx->skb) || (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE)))) txrc.short_preamble = true; @@ -729,7 +731,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) "%s: Dropped data frame as no usable bitrate found while " "scanning and associated. Target station: " "%pM on %d GHz band\n", - tx->sdata->name, hdr->addr1, + tx->sdata->name, + encap ? ((struct ethhdr *)hdr)->h_dest : hdr->addr1, info->band ? 5 : 2)) return TX_DROP; @@ -763,7 +766,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) if (txrc.reported_rate.idx < 0) { txrc.reported_rate = tx->rate; - if (tx->sta && ieee80211_is_data(hdr->frame_control)) + if (tx->sta && ieee80211_is_tx_data(tx->skb)) tx->sta->tx_stats.last_rate = txrc.reported_rate; } else if (tx->sta) tx->sta->tx_stats.last_rate = txrc.reported_rate; @@ -1144,6 +1147,29 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, return queued; } +static void +ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + struct rate_control_ref *ref = sdata->local->rate_ctrl; + u16 tid; + + if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) + return; + + if (!sta || !sta->sta.ht_cap.ht_supported || + !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || + skb->protocol == sdata->control_port_protocol) + return; + + tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + if (likely(sta->ampdu_mlme.tid_tx[tid])) + return; + + ieee80211_start_tx_ba_session(&sta->sta, tid, 0); +} + /* * initialises @tx * pass %NULL for the station if unknown, a valid pointer if known @@ -1157,6 +1183,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + bool aggr_check = false; int tid; memset(tx, 0, sizeof(*tx)); @@ -1185,8 +1212,10 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, } else if (tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } - if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) + if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) { tx->sta = sta_info_get(sdata, hdr->addr1); + aggr_check = true; + } } if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && @@ -1196,8 +1225,12 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_tx *tid_tx; tid = ieee80211_get_tid(hdr); - tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); + if (!tid_tx && aggr_check) { + ieee80211_aggr_check(sdata, tx->sta, skb); + tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); + } + if (tid_tx) { bool queued; @@ -1447,7 +1480,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, codel_vars_init(&txqi->def_cvars); codel_stats_init(&txqi->cstats); __skb_queue_head_init(&txqi->frags); - INIT_LIST_HEAD(&txqi->schedule_order); + RB_CLEAR_NODE(&txqi->schedule_order); txqi->txq.vif = &sdata->vif; @@ -1491,9 +1524,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, ieee80211_purge_tx_queue(&local->hw, &txqi->frags); spin_unlock_bh(&fq->lock); - spin_lock_bh(&local->active_txq_lock[txqi->txq.ac]); - list_del_init(&txqi->schedule_order); - spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); + ieee80211_unschedule_txq(&local->hw, &txqi->txq, true); } void ieee80211_txq_set_params(struct ieee80211_local *local) @@ -1768,8 +1799,6 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); - if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) - CALL_TXH(ieee80211_tx_h_rate_ctrl); txh_done: if (unlikely(res == TX_DROP)) { @@ -1802,6 +1831,9 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) goto txh_done; } + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) + CALL_TXH(ieee80211_tx_h_rate_ctrl); + CALL_TXH(ieee80211_tx_h_michael_mic_add); CALL_TXH(ieee80211_tx_h_sequence); CALL_TXH(ieee80211_tx_h_fragment); @@ -3210,7 +3242,9 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) return true; - if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr))) + if (!ieee80211_amsdu_realloc_pad(local, skb, + sizeof(*amsdu_hdr) + + local->hw.extra_tx_headroom)) return false; data = skb_push(skb, sizeof(*amsdu_hdr)); @@ -3284,6 +3318,9 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) return false; + if (sdata->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED) + return false; + if (skb_is_gso(skb)) return false; @@ -3389,15 +3426,21 @@ out: * Can be called while the sta lock is held. Anything that can cause packets to * be generated will cause deadlock! */ -static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, u8 pn_offs, - struct ieee80211_key *key, - struct sk_buff *skb) +static ieee80211_tx_result +ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 pn_offs, + struct ieee80211_key *key, + struct ieee80211_tx_data *tx) { + struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; u8 tid = IEEE80211_NUM_TIDS; + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL) && + ieee80211_tx_h_rate_ctrl(tx) != TX_CONTINUE) + return TX_DROP; + if (key) info->control.hw_key = &key->conf; @@ -3446,6 +3489,8 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, break; } } + + return TX_CONTINUE; } static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, @@ -3549,24 +3594,17 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, tx.sta = sta; tx.key = fast_tx->key; - if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { - tx.skb = skb; - r = ieee80211_tx_h_rate_ctrl(&tx); - skb = tx.skb; - tx.skb = NULL; - - if (r != TX_CONTINUE) { - if (r != TX_QUEUED) - kfree_skb(skb); - return true; - } - } - if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; - ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, - fast_tx->key, skb); + tx.skb = skb; + r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, + fast_tx->key, &tx); + tx.skb = NULL; + if (r == TX_DROP) { + kfree_skb(skb); + return true; + } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -3671,8 +3709,16 @@ begin: else info->flags &= ~IEEE80211_TX_CTL_AMPDU; - if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) + if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { + if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { + r = ieee80211_tx_h_rate_ctrl(&tx); + if (r != TX_CONTINUE) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + } goto encap_out; + } if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { struct sta_info *sta = container_of(txq->sta, struct sta_info, @@ -3683,8 +3729,12 @@ begin: (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) pn_offs = ieee80211_hdrlen(hdr->frame_control); - ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, - tx.key, skb); + r = ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, + tx.key, &tx); + if (r != TX_CONTINUE) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } } else { if (invoke_tx_handlers_late(&tx)) goto begin; @@ -3764,102 +3814,259 @@ EXPORT_SYMBOL(ieee80211_tx_dequeue); struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); + struct airtime_sched_info *air_sched; + u64 now = ktime_get_boottime_ns(); struct ieee80211_txq *ret = NULL; - struct txq_info *txqi = NULL, *head = NULL; - bool found_eligible_txq = false; + struct airtime_info *air_info; + struct txq_info *txqi = NULL; + struct rb_node *node; + bool first = false; - spin_lock_bh(&local->active_txq_lock[ac]); + air_sched = &local->airtime[ac]; + spin_lock_bh(&air_sched->lock); - begin: - txqi = list_first_entry_or_null(&local->active_txqs[ac], - struct txq_info, - schedule_order); - if (!txqi) + node = air_sched->schedule_pos; + +begin: + if (!node) { + node = rb_first_cached(&air_sched->active_txqs); + first = true; + } else { + node = rb_next(node); + } + + if (!node) goto out; - if (txqi == head) { - if (!found_eligible_txq) - goto out; - else - found_eligible_txq = false; + txqi = container_of(node, struct txq_info, schedule_order); + air_info = to_airtime_info(&txqi->txq); + + if (air_info->v_t > air_sched->v_t && + (!first || !airtime_catchup_v_t(air_sched, air_info->v_t, now))) + goto out; + + if (!ieee80211_txq_airtime_check(hw, &txqi->txq)) { + first = false; + goto begin; } - if (!head) - head = txqi; + air_sched->schedule_pos = node; + air_sched->last_schedule_activity = now; + ret = &txqi->txq; +out: + spin_unlock_bh(&air_sched->lock); + return ret; +} +EXPORT_SYMBOL(ieee80211_next_txq); - if (txqi->txq.sta) { - struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); - bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq); - s64 deficit = sta->airtime[txqi->txq.ac].deficit; +static void __ieee80211_insert_txq(struct rb_root_cached *root, + struct txq_info *txqi) +{ + struct rb_node **new = &root->rb_root.rb_node; + struct airtime_info *old_air, *new_air; + struct rb_node *parent = NULL; + struct txq_info *__txqi; + bool leftmost = true; + + while (*new) { + parent = *new; + __txqi = rb_entry(parent, struct txq_info, schedule_order); + old_air = to_airtime_info(&__txqi->txq); + new_air = to_airtime_info(&txqi->txq); + + if (new_air->v_t <= old_air->v_t) { + new = &parent->rb_left; + } else { + new = &parent->rb_right; + leftmost = false; + } + } - if (aql_check) - found_eligible_txq = true; + rb_link_node(&txqi->schedule_order, parent, new); + rb_insert_color_cached(&txqi->schedule_order, root, leftmost); +} - if (deficit < 0) - sta->airtime[txqi->txq.ac].deficit += - sta->airtime_weight; +void ieee80211_resort_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct airtime_info *air_info = to_airtime_info(txq); + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = to_txq_info(txq); + struct airtime_sched_info *air_sched; - if (deficit < 0 || !aql_check) { - list_move_tail(&txqi->schedule_order, - &local->active_txqs[txqi->txq.ac]); - goto begin; + air_sched = &local->airtime[txq->ac]; + + lockdep_assert_held(&air_sched->lock); + + if (!RB_EMPTY_NODE(&txqi->schedule_order)) { + struct airtime_info *a_prev = NULL, *a_next = NULL; + struct txq_info *t_prev, *t_next; + struct rb_node *n_prev, *n_next; + + /* Erasing a node can cause an expensive rebalancing operation, + * so we check the previous and next nodes first and only remove + * and re-insert if the current node is not already in the + * correct position. + */ + if ((n_prev = rb_prev(&txqi->schedule_order)) != NULL) { + t_prev = container_of(n_prev, struct txq_info, + schedule_order); + a_prev = to_airtime_info(&t_prev->txq); } + + if ((n_next = rb_next(&txqi->schedule_order)) != NULL) { + t_next = container_of(n_next, struct txq_info, + schedule_order); + a_next = to_airtime_info(&t_next->txq); + } + + if ((!a_prev || a_prev->v_t <= air_info->v_t) && + (!a_next || a_next->v_t > air_info->v_t)) + return; + + if (air_sched->schedule_pos == &txqi->schedule_order) + air_sched->schedule_pos = n_prev; + + rb_erase_cached(&txqi->schedule_order, + &air_sched->active_txqs); + RB_CLEAR_NODE(&txqi->schedule_order); + __ieee80211_insert_txq(&air_sched->active_txqs, txqi); + } +} + +void ieee80211_update_airtime_weight(struct ieee80211_local *local, + struct airtime_sched_info *air_sched, + u64 now, bool force) +{ + struct airtime_info *air_info, *tmp; + u64 weight_sum = 0; + + if (unlikely(!now)) + now = ktime_get_boottime_ns(); + + lockdep_assert_held(&air_sched->lock); + + if (!force && (air_sched->last_weight_update < + now - AIRTIME_ACTIVE_DURATION)) + return; + + list_for_each_entry_safe(air_info, tmp, + &air_sched->active_list, list) { + if (airtime_is_active(air_info, now)) + weight_sum += air_info->weight; + else + list_del_init(&air_info->list); } + airtime_weight_sum_set(air_sched, weight_sum); + air_sched->last_weight_update = now; +} + +void ieee80211_schedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) + __acquires(txq_lock) __releases(txq_lock) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = to_txq_info(txq); + struct airtime_sched_info *air_sched; + u64 now = ktime_get_boottime_ns(); + struct airtime_info *air_info; + u8 ac = txq->ac; + bool was_active; + + air_sched = &local->airtime[ac]; + air_info = to_airtime_info(txq); + spin_lock_bh(&air_sched->lock); + was_active = airtime_is_active(air_info, now); + airtime_set_active(air_sched, air_info, now); - if (txqi->schedule_round == local->schedule_round[ac]) + if (!RB_EMPTY_NODE(&txqi->schedule_order)) goto out; - list_del_init(&txqi->schedule_order); - txqi->schedule_round = local->schedule_round[ac]; - ret = &txqi->txq; + /* If the station has been inactive for a while, catch up its v_t so it + * doesn't get indefinite priority; see comment above the definition of + * AIRTIME_MAX_BEHIND. + */ + if ((!was_active && air_info->v_t < air_sched->v_t) || + air_info->v_t < air_sched->v_t - AIRTIME_MAX_BEHIND) + air_info->v_t = air_sched->v_t; + + ieee80211_update_airtime_weight(local, air_sched, now, !was_active); + __ieee80211_insert_txq(&air_sched->active_txqs, txqi); out: - spin_unlock_bh(&local->active_txq_lock[ac]); - return ret; + spin_unlock_bh(&air_sched->lock); } -EXPORT_SYMBOL(ieee80211_next_txq); +EXPORT_SYMBOL(ieee80211_schedule_txq); -void __ieee80211_schedule_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq, - bool force) +static void __ieee80211_unschedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool purge) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); + struct airtime_sched_info *air_sched; + struct airtime_info *air_info; - spin_lock_bh(&local->active_txq_lock[txq->ac]); - - if (list_empty(&txqi->schedule_order) && - (force || !skb_queue_empty(&txqi->frags) || - txqi->tin.backlog_packets)) { - /* If airtime accounting is active, always enqueue STAs at the - * head of the list to ensure that they only get moved to the - * back by the airtime DRR scheduler once they have a negative - * deficit. A station that already has a negative deficit will - * get immediately moved to the back of the list on the next - * call to ieee80211_next_txq(). - */ - if (txqi->txq.sta && local->airtime_flags && - wiphy_ext_feature_isset(local->hw.wiphy, - NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) - list_add(&txqi->schedule_order, - &local->active_txqs[txq->ac]); - else - list_add_tail(&txqi->schedule_order, - &local->active_txqs[txq->ac]); + air_sched = &local->airtime[txq->ac]; + air_info = to_airtime_info(&txqi->txq); + + lockdep_assert_held(&air_sched->lock); + + if (purge) { + list_del_init(&air_info->list); + ieee80211_update_airtime_weight(local, air_sched, 0, true); } - spin_unlock_bh(&local->active_txq_lock[txq->ac]); + if (RB_EMPTY_NODE(&txqi->schedule_order)) + return; + + if (air_sched->schedule_pos == &txqi->schedule_order) + air_sched->schedule_pos = rb_prev(&txqi->schedule_order); + + if (!purge) + airtime_set_active(air_sched, air_info, + ktime_get_boottime_ns()); + + rb_erase_cached(&txqi->schedule_order, + &air_sched->active_txqs); + RB_CLEAR_NODE(&txqi->schedule_order); } -EXPORT_SYMBOL(__ieee80211_schedule_txq); + +void ieee80211_unschedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool purge) + __acquires(txq_lock) __releases(txq_lock) +{ + struct ieee80211_local *local = hw_to_local(hw); + + spin_lock_bh(&local->airtime[txq->ac].lock); + __ieee80211_unschedule_txq(hw, txq, purge); + spin_unlock_bh(&local->airtime[txq->ac].lock); +} + +void ieee80211_return_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, bool force) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = to_txq_info(txq); + + spin_lock_bh(&local->airtime[txq->ac].lock); + + if (!RB_EMPTY_NODE(&txqi->schedule_order) && !force && + !txq_has_queue(txq)) + __ieee80211_unschedule_txq(hw, txq, false); + + spin_unlock_bh(&local->airtime[txq->ac].lock); +} +EXPORT_SYMBOL(ieee80211_return_txq); DEFINE_STATIC_KEY_FALSE(aql_disable); bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { - struct sta_info *sta; + struct airtime_info *air_info = to_airtime_info(txq); struct ieee80211_local *local = hw_to_local(hw); if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) @@ -3874,15 +4081,12 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, if (unlikely(txq->tid == IEEE80211_NUM_TIDS)) return true; - sta = container_of(txq->sta, struct sta_info, sta); - if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < - sta->airtime[txq->ac].aql_limit_low) + if (atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_low) return true; if (atomic_read(&local->aql_total_pending_airtime) < local->aql_threshold && - atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < - sta->airtime[txq->ac].aql_limit_high) + atomic_read(&air_info->aql_tx_pending) < air_info->aql_limit_high) return true; return false; @@ -3892,60 +4096,59 @@ EXPORT_SYMBOL(ieee80211_txq_airtime_check); bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) { + struct txq_info *first_txqi = NULL, *txqi = to_txq_info(txq); struct ieee80211_local *local = hw_to_local(hw); - struct txq_info *iter, *tmp, *txqi = to_txq_info(txq); - struct sta_info *sta; - u8 ac = txq->ac; + struct airtime_sched_info *air_sched; + struct airtime_info *air_info; + struct rb_node *node = NULL; + bool ret = false; + u64 now; - spin_lock_bh(&local->active_txq_lock[ac]); - if (!txqi->txq.sta) - goto out; + if (!ieee80211_txq_airtime_check(hw, txq)) + return false; - if (list_empty(&txqi->schedule_order)) + air_sched = &local->airtime[txq->ac]; + spin_lock_bh(&air_sched->lock); + + if (RB_EMPTY_NODE(&txqi->schedule_order)) goto out; - list_for_each_entry_safe(iter, tmp, &local->active_txqs[ac], - schedule_order) { - if (iter == txqi) - break; + now = ktime_get_boottime_ns(); - if (!iter->txq.sta) { - list_move_tail(&iter->schedule_order, - &local->active_txqs[ac]); - continue; - } - sta = container_of(iter->txq.sta, struct sta_info, sta); - if (sta->airtime[ac].deficit < 0) - sta->airtime[ac].deficit += sta->airtime_weight; - list_move_tail(&iter->schedule_order, &local->active_txqs[ac]); - } + /* Like in ieee80211_next_txq(), make sure the first station in the + * scheduling order is eligible for transmission to avoid starvation. + */ + node = rb_first_cached(&air_sched->active_txqs); + if (node) { + first_txqi = container_of(node, struct txq_info, + schedule_order); + air_info = to_airtime_info(&first_txqi->txq); - sta = container_of(txqi->txq.sta, struct sta_info, sta); - if (sta->airtime[ac].deficit >= 0) - goto out; + if (air_sched->v_t < air_info->v_t) + airtime_catchup_v_t(air_sched, air_info->v_t, now); + } - sta->airtime[ac].deficit += sta->airtime_weight; - list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]); - spin_unlock_bh(&local->active_txq_lock[ac]); + air_info = to_airtime_info(&txqi->txq); + if (air_info->v_t <= air_sched->v_t) { + air_sched->last_schedule_activity = now; + ret = true; + } - return false; out: - if (!list_empty(&txqi->schedule_order)) - list_del_init(&txqi->schedule_order); - spin_unlock_bh(&local->active_txq_lock[ac]); - - return true; + spin_unlock_bh(&air_sched->lock); + return ret; } EXPORT_SYMBOL(ieee80211_txq_may_transmit); void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); + struct airtime_sched_info *air_sched = &local->airtime[ac]; - spin_lock_bh(&local->active_txq_lock[ac]); - local->schedule_round[ac]++; - spin_unlock_bh(&local->active_txq_lock[ac]); + spin_lock_bh(&air_sched->lock); + air_sched->schedule_pos = NULL; + spin_unlock_bh(&air_sched->lock); } EXPORT_SYMBOL(ieee80211_txq_schedule_start); @@ -3979,6 +4182,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_get_hash(skb); } + ieee80211_aggr_check(sdata, sta, skb); + if (sta) { struct ieee80211_fast_tx *fast_tx; @@ -4242,6 +4447,8 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memset(info, 0, sizeof(*info)); + ieee80211_aggr_check(sdata, sta, skb); + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { @@ -4577,11 +4784,11 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon) { + u8 *beacon_data, count, max_count = 1; struct probe_resp *resp; - u8 *beacon_data; size_t beacon_data_len; + u16 *bcn_offsets; int i; - u8 count = beacon->cntdwn_current_counter; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: @@ -4601,21 +4808,27 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, } rcu_read_lock(); - for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; ++i) { - resp = rcu_dereference(sdata->u.ap.probe_resp); + resp = rcu_dereference(sdata->u.ap.probe_resp); - if (beacon->cntdwn_counter_offsets[i]) { - if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[i] >= - beacon_data_len)) { + bcn_offsets = beacon->cntdwn_counter_offsets; + count = beacon->cntdwn_current_counter; + if (sdata->vif.csa_active) + max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM; + + for (i = 0; i < max_count; ++i) { + if (bcn_offsets[i]) { + if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) { rcu_read_unlock(); return; } - - beacon_data[beacon->cntdwn_counter_offsets[i]] = count; + beacon_data[bcn_offsets[i]] = count; } - if (sdata->vif.type == NL80211_IFTYPE_AP && resp) - resp->data[resp->cntdwn_counter_offsets[i]] = count; + if (sdata->vif.type == NL80211_IFTYPE_AP && resp) { + u16 *resp_offsets = resp->cntdwn_counter_offsets; + + resp->data[resp_offsets[i]] = count; + } } rcu_read_unlock(); } @@ -4825,6 +5038,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (offs) { offs->tim_offset = beacon->head_len; offs->tim_length = skb->len - beacon->head_len; + offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; /* for AP the csa offsets are from tail */ csa_off_base = skb->len; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 060059ef9668..49cb96d25169 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * utilities for mac80211 */ @@ -1336,6 +1336,18 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elems->rsnx = pos; elems->rsnx_len = elen; break; + case WLAN_EID_TX_POWER_ENVELOPE: + if (elen < 1 || + elen > sizeof(struct ieee80211_tx_pwr_env)) + break; + + if (elems->tx_pwr_env_num >= ARRAY_SIZE(elems->tx_pwr_env)) + break; + + elems->tx_pwr_env[elems->tx_pwr_env_num] = (void *)pos; + elems->tx_pwr_env_len[elems->tx_pwr_env_num] = elen; + elems->tx_pwr_env_num++; + break; case WLAN_EID_EXTENSION: ieee80211_parse_extension_element(calc_crc ? &crc : NULL, @@ -1693,7 +1705,10 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); - WARN_ON(err); + if (WARN_ON(err)) { + kfree_skb(skb); + return; + } } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | @@ -1934,13 +1949,26 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, *offset = noffset; } - he_cap = ieee80211_get_he_sta_cap(sband); - if (he_cap) { + he_cap = ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); + if (he_cap && + cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), + IEEE80211_CHAN_NO_HE)) { pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; + } + + if (cfg80211_any_usable_channels(local->hw.wiphy, + BIT(NL80211_BAND_6GHZ), + IEEE80211_CHAN_NO_HE)) { + struct ieee80211_supported_band *sband6; + + sband6 = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + he_cap = ieee80211_get_he_iftype_cap(sband6, + ieee80211_vif_type_p2p(&sdata->vif)); - if (sband->band == NL80211_BAND_6GHZ) { + if (he_cap) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); @@ -2944,12 +2972,15 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, u8 *pos; u16 cap; - sband = ieee80211_get_sband(sdata); - if (!sband) + if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, + BIT(NL80211_BAND_6GHZ), + IEEE80211_CHAN_NO_HE)) return; + sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + iftd = ieee80211_get_sband_iftype_data(sband, iftype); - if (WARN_ON(!iftd)) + if (!iftd) return; /* Check for device HE 6 GHz capability before adding element */ diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 1cf5ac09edcb..323d3d2d986f 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -617,7 +617,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, { struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; - int ret = -ENOMEM; + int ret; ASSERT_RTNL(); diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig new file mode 100644 index 000000000000..2cdf3d0a28c9 --- /dev/null +++ b/net/mctp/Kconfig @@ -0,0 +1,13 @@ + +menuconfig MCTP + depends on NET + tristate "MCTP core protocol support" + help + Management Component Transport Protocol (MCTP) is an in-system + protocol for communicating between management controllers and + their managed devices (peripherals, host processors, etc.). The + protocol is defined by DMTF specification DSP0236. + + This option enables core MCTP support. For communicating with other + devices, you'll want to enable a driver for a specific hardware + channel. diff --git a/net/mctp/Makefile b/net/mctp/Makefile new file mode 100644 index 000000000000..0171333384d7 --- /dev/null +++ b/net/mctp/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MCTP) += mctp.o +mctp-objs := af_mctp.o device.o route.o neigh.o diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c new file mode 100644 index 000000000000..a9526ac29dff --- /dev/null +++ b/net/mctp/af_mctp.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/if_arp.h> +#include <linux/net.h> +#include <linux/mctp.h> +#include <linux/module.h> +#include <linux/socket.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/sock.h> + +/* socket implementation */ + +static int mctp_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + sk->sk_prot->close(sk, 0); + } + + return 0; +} + +static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +{ + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct sockaddr_mctp *smctp; + int rc; + + if (addrlen < sizeof(*smctp)) + return -EINVAL; + + if (addr->sa_family != AF_MCTP) + return -EAFNOSUPPORT; + + if (!capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + /* it's a valid sockaddr for MCTP, cast and do protocol checks */ + smctp = (struct sockaddr_mctp *)addr; + + lock_sock(sk); + + /* TODO: allow rebind */ + if (sk_hashed(sk)) { + rc = -EADDRINUSE; + goto out_release; + } + msk->bind_net = smctp->smctp_network; + msk->bind_addr = smctp->smctp_addr.s_addr; + msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */ + + rc = sk->sk_prot->hash(sk); + +out_release: + release_sock(sk); + + return rc; +} + +static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); + const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr); + int rc, addrlen = msg->msg_namelen; + struct sock *sk = sock->sk; + struct mctp_skb_cb *cb; + struct mctp_route *rt; + struct sk_buff *skb; + + if (addr) { + if (addrlen < sizeof(struct sockaddr_mctp)) + return -EINVAL; + if (addr->smctp_family != AF_MCTP) + return -EINVAL; + if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) + return -EINVAL; + + } else { + /* TODO: connect()ed sockets */ + return -EDESTADDRREQ; + } + + if (!capable(CAP_NET_RAW)) + return -EACCES; + + if (addr->smctp_network == MCTP_NET_ANY) + addr->smctp_network = mctp_default_net(sock_net(sk)); + + rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr); + if (!rt) + return -EHOSTUNREACH; + + skb = sock_alloc_send_skb(sk, hlen + 1 + len, + msg->msg_flags & MSG_DONTWAIT, &rc); + if (!skb) + return rc; + + skb_reserve(skb, hlen); + + /* set type as fist byte in payload */ + *(u8 *)skb_put(skb, 1) = addr->smctp_type; + + rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); + if (rc < 0) { + kfree_skb(skb); + return rc; + } + + /* set up cb */ + cb = __mctp_cb(skb); + cb->net = addr->smctp_network; + + rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, + addr->smctp_tag); + + return rc ? : len; +} + +static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); + struct sock *sk = sock->sk; + struct sk_buff *skb; + size_t msglen; + u8 type; + int rc; + + if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK)) + return -EOPNOTSUPP; + + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &rc); + if (!skb) + return rc; + + if (!skb->len) { + rc = 0; + goto out_free; + } + + /* extract message type, remove from data */ + type = *((u8 *)skb->data); + msglen = skb->len - 1; + + if (len < msglen) + msg->msg_flags |= MSG_TRUNC; + else + len = msglen; + + rc = skb_copy_datagram_msg(skb, 1, msg, len); + if (rc < 0) + goto out_free; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (addr) { + struct mctp_skb_cb *cb = mctp_cb(skb); + /* TODO: expand mctp_skb_cb for header fields? */ + struct mctp_hdr *hdr = mctp_hdr(skb); + + addr = msg->msg_name; + addr->smctp_family = AF_MCTP; + addr->smctp_network = cb->net; + addr->smctp_addr.s_addr = hdr->src; + addr->smctp_type = type; + addr->smctp_tag = hdr->flags_seq_tag & + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + msg->msg_namelen = sizeof(*addr); + } + + rc = len; + + if (flags & MSG_TRUNC) + rc = msglen; + +out_free: + skb_free_datagram(sk, skb); + return rc; +} + +static int mctp_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return -EINVAL; +} + +static int mctp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EINVAL; +} + +static const struct proto_ops mctp_dgram_ops = { + .family = PF_MCTP, + .release = mctp_release, + .bind = mctp_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .gettstamp = sock_gettstamp, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = mctp_setsockopt, + .getsockopt = mctp_getsockopt, + .sendmsg = mctp_sendmsg, + .recvmsg = mctp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int mctp_sk_init(struct sock *sk) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + INIT_HLIST_HEAD(&msk->keys); + return 0; +} + +static void mctp_sk_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +static int mctp_sk_hash(struct sock *sk) +{ + struct net *net = sock_net(sk); + + mutex_lock(&net->mctp.bind_lock); + sk_add_node_rcu(sk, &net->mctp.binds); + mutex_unlock(&net->mctp.bind_lock); + + return 0; +} + +static void mctp_sk_unhash(struct sock *sk) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct net *net = sock_net(sk); + struct mctp_sk_key *key; + struct hlist_node *tmp; + unsigned long flags; + + /* remove from any type-based binds */ + mutex_lock(&net->mctp.bind_lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&net->mctp.bind_lock); + + /* remove tag allocations */ + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + hlist_del_rcu(&key->sklist); + hlist_del_rcu(&key->hlist); + + spin_lock(&key->reasm_lock); + if (key->reasm_head) + kfree_skb(key->reasm_head); + key->reasm_head = NULL; + key->reasm_dead = true; + spin_unlock(&key->reasm_lock); + + kfree_rcu(key, rcu); + } + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + synchronize_rcu(); +} + +static struct proto mctp_proto = { + .name = "MCTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct mctp_sock), + .init = mctp_sk_init, + .close = mctp_sk_close, + .hash = mctp_sk_hash, + .unhash = mctp_sk_unhash, +}; + +static int mctp_pf_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + const struct proto_ops *ops; + struct proto *proto; + struct sock *sk; + int rc; + + if (protocol) + return -EPROTONOSUPPORT; + + /* only datagram sockets are supported */ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + proto = &mctp_proto; + ops = &mctp_dgram_ops; + + sock->state = SS_UNCONNECTED; + sock->ops = ops; + + sk = sk_alloc(net, PF_MCTP, GFP_KERNEL, proto, kern); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + rc = 0; + if (sk->sk_prot->init) + rc = sk->sk_prot->init(sk); + + if (rc) + goto err_sk_put; + + return 0; + +err_sk_put: + sock_orphan(sk); + sock_put(sk); + return rc; +} + +static struct net_proto_family mctp_pf = { + .family = PF_MCTP, + .create = mctp_pf_create, + .owner = THIS_MODULE, +}; + +static __init int mctp_init(void) +{ + int rc; + + /* ensure our uapi tag definitions match the header format */ + BUILD_BUG_ON(MCTP_TAG_OWNER != MCTP_HDR_FLAG_TO); + BUILD_BUG_ON(MCTP_TAG_MASK != MCTP_HDR_TAG_MASK); + + pr_info("mctp: management component transport protocol core\n"); + + rc = sock_register(&mctp_pf); + if (rc) + return rc; + + rc = proto_register(&mctp_proto, 0); + if (rc) + goto err_unreg_sock; + + rc = mctp_routes_init(); + if (rc) + goto err_unreg_proto; + + rc = mctp_neigh_init(); + if (rc) + goto err_unreg_proto; + + mctp_device_init(); + + return 0; + +err_unreg_proto: + proto_unregister(&mctp_proto); +err_unreg_sock: + sock_unregister(PF_MCTP); + + return rc; +} + +static __exit void mctp_exit(void) +{ + mctp_device_exit(); + mctp_neigh_exit(); + mctp_routes_exit(); + proto_unregister(&mctp_proto); + sock_unregister(PF_MCTP); +} + +module_init(mctp_init); +module_exit(mctp_exit); + +MODULE_DESCRIPTION("MCTP core"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jeremy Kerr <jk@codeconstruct.com.au>"); + +MODULE_ALIAS_NETPROTO(PF_MCTP); diff --git a/net/mctp/device.c b/net/mctp/device.c new file mode 100644 index 000000000000..b9f38e765f61 --- /dev/null +++ b/net/mctp/device.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - device implementation. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/if_link.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> + +#include <net/addrconf.h> +#include <net/netlink.h> +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/sock.h> + +struct mctp_dump_cb { + int h; + int idx; + size_t a_idx; +}; + +/* unlocked: caller must hold rcu_read_lock */ +struct mctp_dev *__mctp_dev_get(const struct net_device *dev) +{ + return rcu_dereference(dev->mctp_ptr); +} + +struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) +{ + return rtnl_dereference(dev->mctp_ptr); +} + +static void mctp_dev_destroy(struct mctp_dev *mdev) +{ + struct net_device *dev = mdev->dev; + + dev_put(dev); + kfree_rcu(mdev, rcu); +} + +static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, + struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct ifaddrmsg *hdr; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWADDR, sizeof(*hdr), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifa_family = AF_MCTP; + hdr->ifa_prefixlen = 0; + hdr->ifa_flags = 0; + hdr->ifa_scope = 0; + hdr->ifa_index = mdev->dev->ifindex; + + if (nla_put_u8(skb, IFA_LOCAL, eid)) + goto cancel; + + if (nla_put_u8(skb, IFA_ADDRESS, eid)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + int rc = 0; + + for (; mcb->a_idx < mdev->num_addrs; mcb->a_idx++) { + rc = mctp_fill_addrinfo(skb, cb, mdev, mdev->addrs[mcb->a_idx]); + if (rc < 0) + break; + } + + return rc; +} + +static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + struct net_device *dev; + struct ifaddrmsg *hdr; + struct mctp_dev *mdev; + int ifindex; + int idx, rc; + + hdr = nlmsg_data(cb->nlh); + // filter by ifindex if requested + ifindex = hdr->ifa_index; + + rcu_read_lock(); + for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { + idx = 0; + head = &net->dev_index_head[mcb->h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx >= mcb->idx && + (ifindex == 0 || ifindex == dev->ifindex)) { + mdev = __mctp_dev_get(dev); + if (mdev) { + rc = mctp_dump_dev_addrinfo(mdev, + skb, cb); + // Error indicates full buffer, this + // callback will get retried. + if (rc < 0) + goto out; + } + } + idx++; + // reset for next iteration + mcb->a_idx = 0; + } + } +out: + rcu_read_unlock(); + mcb->idx = idx; + + return skb->len; +} + +static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = { + [IFA_ADDRESS] = { .type = NLA_U8 }, + [IFA_LOCAL] = { .type = NLA_U8 }, +}; + +static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *tmp_addrs; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (!mctp_address_ok(addr->s_addr)) + return -EINVAL; + + /* Prevent duplicates. Under RTNL so don't need to lock for reading */ + if (memchr(mdev->addrs, addr->s_addr, mdev->num_addrs)) + return -EEXIST; + + tmp_addrs = kmalloc(mdev->num_addrs + 1, GFP_KERNEL); + if (!tmp_addrs) + return -ENOMEM; + memcpy(tmp_addrs, mdev->addrs, mdev->num_addrs); + tmp_addrs[mdev->num_addrs] = addr->s_addr; + + /* Lock to write */ + spin_lock_irqsave(&mdev->addrs_lock, flags); + mdev->num_addrs++; + swap(mdev->addrs, tmp_addrs); + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + kfree(tmp_addrs); + + mctp_route_add_local(mdev, addr->s_addr); + + return 0; +} + +static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *pos; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + pos = memchr(mdev->addrs, addr->s_addr, mdev->num_addrs); + if (!pos) + return -ENOENT; + + rc = mctp_route_remove_local(mdev, addr->s_addr); + // we can ignore -ENOENT in the case a route was already removed + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&mdev->addrs_lock, flags); + memmove(pos, pos + 1, mdev->num_addrs - 1 - (pos - mdev->addrs)); + mdev->num_addrs--; + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + return 0; +} + +static struct mctp_dev *mctp_add_dev(struct net_device *dev) +{ + struct mctp_dev *mdev; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&mdev->addrs_lock); + + mdev->net = mctp_default_net(dev_net(dev)); + + /* associate to net_device */ + rcu_assign_pointer(dev->mctp_ptr, mdev); + dev_hold(dev); + mdev->dev = dev; + + return mdev; +} + +static int mctp_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODATA; + if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) + return -EMSGSIZE; + return 0; +} + +static size_t mctp_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + unsigned int ret; + + /* caller holds RCU */ + mdev = __mctp_dev_get(dev); + if (!mdev) + return 0; + ret = nla_total_size(4); /* IFLA_MCTP_NET */ + return ret; +} + +static const struct nla_policy ifla_af_mctp_policy[IFLA_MCTP_MAX + 1] = { + [IFLA_MCTP_NET] = { .type = NLA_U32 }, +}; + +static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MCTP_MAX + 1]; + struct mctp_dev *mdev; + int rc; + + rc = nla_parse_nested(tb, IFLA_MCTP_MAX, attr, ifla_af_mctp_policy, + NULL); + if (rc) + return rc; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return 0; + + if (tb[IFLA_MCTP_NET]) + WRITE_ONCE(mdev->net, nla_get_u32(tb[IFLA_MCTP_NET])); + + return 0; +} + +static void mctp_unregister(struct net_device *dev) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + + if (!mdev) + return; + + RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL); + + mctp_route_remove_dev(mdev); + mctp_neigh_remove_dev(mdev); + kfree(mdev->addrs); + + mctp_dev_destroy(mdev); +} + +static int mctp_register(struct net_device *dev) +{ + struct mctp_dev *mdev; + + /* Already registered? */ + if (rtnl_dereference(dev->mctp_ptr)) + return 0; + + /* only register specific types; MCTP-specific and loopback for now */ + if (dev->type != ARPHRD_MCTP && dev->type != ARPHRD_LOOPBACK) + return 0; + + mdev = mctp_add_dev(dev); + if (IS_ERR(mdev)) + return PTR_ERR(mdev); + + return 0; +} + +static int mctp_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int rc; + + switch (event) { + case NETDEV_REGISTER: + rc = mctp_register(dev); + if (rc) + return notifier_from_errno(rc); + break; + case NETDEV_UNREGISTER: + mctp_unregister(dev); + break; + } + + return NOTIFY_OK; +} + +static struct rtnl_af_ops mctp_af_ops = { + .family = AF_MCTP, + .fill_link_af = mctp_fill_link_af, + .get_link_af_size = mctp_get_link_af_size, + .set_link_af = mctp_set_link_af, +}; + +static struct notifier_block mctp_dev_nb = { + .notifier_call = mctp_dev_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, +}; + +void __init mctp_device_init(void) +{ + register_netdevice_notifier(&mctp_dev_nb); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETADDR, + NULL, mctp_dump_addrinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWADDR, + mctp_rtm_newaddr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELADDR, + mctp_rtm_deladdr, NULL, 0); + rtnl_af_register(&mctp_af_ops); +} + +void __exit mctp_device_exit(void) +{ + rtnl_af_unregister(&mctp_af_ops); + rtnl_unregister(PF_MCTP, RTM_DELADDR); + rtnl_unregister(PF_MCTP, RTM_NEWADDR); + rtnl_unregister(PF_MCTP, RTM_GETADDR); + + unregister_netdevice_notifier(&mctp_dev_nb); +} diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c new file mode 100644 index 000000000000..90ed2f02d1fb --- /dev/null +++ b/net/mctp/neigh.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - routing + * implementation. + * + * This is currently based on a simple routing table, with no dst cache. The + * number of routes should stay fairly small, so the lookup cost is small. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/idr.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/netlink.h> +#include <net/sock.h> + +static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, + enum mctp_neigh_source source, + size_t lladdr_len, const void *lladdr) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh; + int rc; + + mutex_lock(&net->mctp.neigh_lock); + if (mctp_neigh_lookup(mdev, eid, NULL) == 0) { + rc = -EEXIST; + goto out; + } + + if (lladdr_len > sizeof(neigh->ha)) { + rc = -EINVAL; + goto out; + } + + neigh = kzalloc(sizeof(*neigh), GFP_KERNEL); + if (!neigh) { + rc = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&neigh->list); + neigh->dev = mdev; + dev_hold(neigh->dev->dev); + neigh->eid = eid; + neigh->source = source; + memcpy(neigh->ha, lladdr, lladdr_len); + + list_add_rcu(&neigh->list, &net->mctp.neighbours); + rc = 0; +out: + mutex_unlock(&net->mctp.neigh_lock); + return rc; +} + +static void __mctp_neigh_free(struct rcu_head *rcu) +{ + struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); + + dev_put(neigh->dev->dev); + kfree(neigh); +} + +/* Removes all neighbour entries referring to a device */ +void mctp_neigh_remove_dev(struct mctp_dev *mdev) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh, *tmp; + + mutex_lock(&net->mctp.neigh_lock); + list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { + if (neigh->dev == mdev) { + list_del_rcu(&neigh->list); + /* TODO: immediate RTM_DELNEIGH */ + call_rcu(&neigh->rcu, __mctp_neigh_free); + } + } + + mutex_unlock(&net->mctp.neigh_lock); +} + +// TODO: add a "source" flag so netlink can only delete static neighbours? +static int mctp_neigh_remove(struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh, *tmp; + bool dropped = false; + + mutex_lock(&net->mctp.neigh_lock); + list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { + if (neigh->dev == mdev && neigh->eid == eid) { + list_del_rcu(&neigh->list); + /* TODO: immediate RTM_DELNEIGH */ + call_rcu(&neigh->rcu, __mctp_neigh_free); + dropped = true; + } + } + + mutex_unlock(&net->mctp.neigh_lock); + return dropped ? 0 : -ENOENT; +} + +static const struct nla_policy nd_mctp_policy[NDA_MAX + 1] = { + [NDA_DST] = { .type = NLA_U8 }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, +}; + +static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct mctp_dev *mdev; + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX + 1]; + int rc; + mctp_eid_t eid; + void *lladdr; + int lladdr_len; + + rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, + extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "lladdr too large?"); + return rc; + } + + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); + return -EINVAL; + } + + if (!tb[NDA_LLADDR]) { + NL_SET_ERR_MSG(extack, "Neighbour lladdr must be specified"); + return -EINVAL; + } + + eid = nla_get_u8(tb[NDA_DST]); + if (!mctp_address_ok(eid)) { + NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); + return -EINVAL; + } + + lladdr = nla_data(tb[NDA_LLADDR]); + lladdr_len = nla_len(tb[NDA_LLADDR]); + + ndm = nlmsg_data(nlh); + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (lladdr_len != dev->addr_len) { + NL_SET_ERR_MSG(extack, "Wrong lladdr length"); + return -EINVAL; + } + + return mctp_neigh_add(mdev, eid, MCTP_NEIGH_STATIC, + lladdr_len, lladdr); +} + +static int mctp_rtm_delneigh(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev; + struct mctp_dev *mdev; + struct ndmsg *ndm; + int rc; + mctp_eid_t eid; + + rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, + extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect format"); + return rc; + } + + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); + return -EINVAL; + } + eid = nla_get_u8(tb[NDA_DST]); + + ndm = nlmsg_data(nlh); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + return mctp_neigh_remove(mdev, eid); +} + +static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, + unsigned int flags, struct mctp_neigh *neigh) +{ + struct net_device *dev = neigh->dev->dev; + struct nlmsghdr *nlh; + struct ndmsg *hdr; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ndm_family = AF_MCTP; + hdr->ndm_ifindex = dev->ifindex; + hdr->ndm_state = 0; // TODO other state bits? + if (neigh->source == MCTP_NEIGH_STATIC) + hdr->ndm_state |= NUD_PERMANENT; + hdr->ndm_flags = 0; + hdr->ndm_type = RTN_UNICAST; // TODO: is loopback RTN_LOCAL? + + if (nla_put_u8(skb, NDA_DST, neigh->eid)) + goto cancel; + + if (nla_put(skb, NDA_LLADDR, dev->addr_len, neigh->ha)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int rc, idx, req_ifindex; + struct mctp_neigh *neigh; + struct ndmsg *ndmsg; + struct { + int idx; + } *cbctx = (void *)cb->ctx; + + ndmsg = nlmsg_data(cb->nlh); + req_ifindex = ndmsg->ndm_ifindex; + + idx = 0; + rcu_read_lock(); + list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { + if (idx < cbctx->idx) + goto cont; + + rc = 0; + if (req_ifindex == 0 || req_ifindex == neigh->dev->dev->ifindex) + rc = mctp_fill_neigh(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, neigh); + + if (rc) + break; +cont: + idx++; + } + rcu_read_unlock(); + + cbctx->idx = idx; + return skb->len; +} + +int mctp_neigh_lookup(struct mctp_dev *mdev, mctp_eid_t eid, void *ret_hwaddr) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh; + int rc = -EHOSTUNREACH; // TODO: or ENOENT? + + rcu_read_lock(); + list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { + if (mdev == neigh->dev && eid == neigh->eid) { + if (ret_hwaddr) + memcpy(ret_hwaddr, neigh->ha, + sizeof(neigh->ha)); + rc = 0; + break; + } + } + rcu_read_unlock(); + return rc; +} + +/* namespace registration */ +static int __net_init mctp_neigh_net_init(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + + INIT_LIST_HEAD(&ns->neighbours); + mutex_init(&ns->neigh_lock); + return 0; +} + +static void __net_exit mctp_neigh_net_exit(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + struct mctp_neigh *neigh; + + list_for_each_entry(neigh, &ns->neighbours, list) + call_rcu(&neigh->rcu, __mctp_neigh_free); +} + +/* net namespace implementation */ + +static struct pernet_operations mctp_net_ops = { + .init = mctp_neigh_net_init, + .exit = mctp_neigh_net_exit, +}; + +int __init mctp_neigh_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, + mctp_rtm_newneigh, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELNEIGH, + mctp_rtm_delneigh, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETNEIGH, + NULL, mctp_rtm_getneigh, 0); + + return register_pernet_subsys(&mctp_net_ops); +} + +void __exit mctp_neigh_exit(void) +{ + unregister_pernet_subsys(&mctp_net_ops); + rtnl_unregister(PF_MCTP, RTM_GETNEIGH); + rtnl_unregister(PF_MCTP, RTM_DELNEIGH); + rtnl_unregister(PF_MCTP, RTM_NEWNEIGH); +} diff --git a/net/mctp/route.c b/net/mctp/route.c new file mode 100644 index 000000000000..5265525011ad --- /dev/null +++ b/net/mctp/route.c @@ -0,0 +1,1116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - routing + * implementation. + * + * This is currently based on a simple routing table, with no dst cache. The + * number of routes should stay fairly small, so the lookup cost is small. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/idr.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <uapi/linux/if_arp.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/netlink.h> +#include <net/sock.h> + +static const unsigned int mctp_message_maxlen = 64 * 1024; + +/* route output callbacks */ +static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +{ + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_hdr *mh; + struct sock *sk; + u8 type; + + WARN_ON(!rcu_read_lock_held()); + + /* TODO: look up in skb->cb? */ + mh = mctp_hdr(skb); + + if (!skb_headlen(skb)) + return NULL; + + type = (*(u8 *)skb->data) & 0x7f; + + sk_for_each_rcu(sk, &net->mctp.binds) { + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net) + continue; + + if (msk->bind_type != type) + continue; + + if (msk->bind_addr != MCTP_ADDR_ANY && + msk->bind_addr != mh->dest) + continue; + + return msk; + } + + return NULL; +} + +static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, + mctp_eid_t peer, u8 tag) +{ + if (key->local_addr != local) + return false; + + if (key->peer_addr != peer) + return false; + + if (key->tag != tag) + return false; + + return true; +} + +static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, + mctp_eid_t peer) +{ + struct mctp_sk_key *key, *ret; + struct mctp_hdr *mh; + u8 tag; + + WARN_ON(!rcu_read_lock_held()); + + mh = mctp_hdr(skb); + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + ret = NULL; + + hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) { + if (mctp_key_match(key, mh->dest, peer, tag)) { + ret = key; + break; + } + } + + return ret; +} + +static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, + mctp_eid_t local, mctp_eid_t peer, + u8 tag, gfp_t gfp) +{ + struct mctp_sk_key *key; + + key = kzalloc(sizeof(*key), gfp); + if (!key) + return NULL; + + key->peer_addr = peer; + key->local_addr = local; + key->tag = tag; + key->sk = &msk->sk; + spin_lock_init(&key->reasm_lock); + + return key; +} + +static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_sk_key *tmp; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { + if (mctp_key_match(tmp, key->local_addr, key->peer_addr, + key->tag)) { + rc = -EEXIST; + break; + } + } + + if (!rc) { + hlist_add_head(&key->hlist, &net->mctp.keys); + hlist_add_head(&key->sklist, &msk->keys); + } + + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + return rc; +} + +/* Must be called with key->reasm_lock, which it will release. Will schedule + * the key for an RCU free. + */ +static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, + unsigned long flags) + __releases(&key->reasm_lock) +{ + struct sk_buff *skb; + + skb = key->reasm_head; + key->reasm_head = NULL; + key->reasm_dead = true; + spin_unlock_irqrestore(&key->reasm_lock, flags); + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_del_rcu(&key->hlist); + hlist_del_rcu(&key->sklist); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + kfree_rcu(key, rcu); + + if (skb) + kfree_skb(skb); +} + +static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) +{ + struct mctp_hdr *hdr = mctp_hdr(skb); + u8 exp_seq, this_seq; + + this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) + & MCTP_HDR_SEQ_MASK; + + if (!key->reasm_head) { + key->reasm_head = skb; + key->reasm_tailp = &(skb_shinfo(skb)->frag_list); + key->last_seq = this_seq; + return 0; + } + + exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK; + + if (this_seq != exp_seq) + return -EINVAL; + + if (key->reasm_head->len + skb->len > mctp_message_maxlen) + return -EINVAL; + + skb->next = NULL; + skb->sk = NULL; + *key->reasm_tailp = skb; + key->reasm_tailp = &skb->next; + + key->last_seq = this_seq; + + key->reasm_head->data_len += skb->len; + key->reasm_head->len += skb->len; + key->reasm_head->truesize += skb->truesize; + + return 0; +} + +static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct mctp_sk_key *key; + struct mctp_sock *msk; + struct mctp_hdr *mh; + unsigned long f; + u8 tag, flags; + int rc; + + msk = NULL; + rc = -EINVAL; + + /* we may be receiving a locally-routed packet; drop source sk + * accounting + */ + skb_orphan(skb); + + /* ensure we have enough data for a header and a type */ + if (skb->len < sizeof(struct mctp_hdr) + 1) + goto out; + + /* grab header, advance data ptr */ + mh = mctp_hdr(skb); + skb_pull(skb, sizeof(struct mctp_hdr)); + + if (mh->ver != 1) + goto out; + + flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM); + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + rcu_read_lock(); + + /* lookup socket / reasm context, exactly matching (src,dest,tag) */ + key = mctp_lookup_key(net, skb, mh->src); + + if (flags & MCTP_HDR_FLAG_SOM) { + if (key) { + msk = container_of(key->sk, struct mctp_sock, sk); + } else { + /* first response to a broadcast? do a more general + * key lookup to find the socket, but don't use this + * key for reassembly - we'll create a more specific + * one for future packets if required (ie, !EOM). + */ + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); + if (key) { + msk = container_of(key->sk, + struct mctp_sock, sk); + key = NULL; + } + } + + if (!key && !msk && (tag & MCTP_HDR_FLAG_TO)) + msk = mctp_lookup_bind(net, skb); + + if (!msk) { + rc = -ENOENT; + goto out_unlock; + } + + /* single-packet message? deliver to socket, clean up any + * pending key. + */ + if (flags & MCTP_HDR_FLAG_EOM) { + sock_queue_rcv_skb(&msk->sk, skb); + if (key) { + spin_lock_irqsave(&key->reasm_lock, f); + /* we've hit a pending reassembly; not much we + * can do but drop it + */ + __mctp_key_unlock_drop(key, net, f); + } + rc = 0; + goto out_unlock; + } + + /* broadcast response or a bind() - create a key for further + * packets for this message + */ + if (!key) { + key = mctp_key_alloc(msk, mh->dest, mh->src, + tag, GFP_ATOMIC); + if (!key) { + rc = -ENOMEM; + goto out_unlock; + } + + /* we can queue without the reasm lock here, as the + * key isn't observable yet + */ + mctp_frag_queue(key, skb); + + /* if the key_add fails, we've raced with another + * SOM packet with the same src, dest and tag. There's + * no way to distinguish future packets, so all we + * can do is drop; we'll free the skb on exit from + * this function. + */ + rc = mctp_key_add(key, msk); + if (rc) + kfree(key); + + } else { + /* existing key: start reassembly */ + spin_lock_irqsave(&key->reasm_lock, f); + + if (key->reasm_head || key->reasm_dead) { + /* duplicate start? drop everything */ + __mctp_key_unlock_drop(key, net, f); + rc = -EEXIST; + } else { + rc = mctp_frag_queue(key, skb); + spin_unlock_irqrestore(&key->reasm_lock, f); + } + } + + } else if (key) { + /* this packet continues a previous message; reassemble + * using the message-specific key + */ + + spin_lock_irqsave(&key->reasm_lock, f); + + /* we need to be continuing an existing reassembly... */ + if (!key->reasm_head) + rc = -EINVAL; + else + rc = mctp_frag_queue(key, skb); + + /* end of message? deliver to socket, and we're done with + * the reassembly/response key + */ + if (!rc && flags & MCTP_HDR_FLAG_EOM) { + sock_queue_rcv_skb(key->sk, key->reasm_head); + key->reasm_head = NULL; + __mctp_key_unlock_drop(key, net, f); + } else { + spin_unlock_irqrestore(&key->reasm_lock, f); + } + + } else { + /* not a start, no matching key */ + rc = -ENOENT; + } + +out_unlock: + rcu_read_unlock(); +out: + if (rc) + kfree_skb(skb); + return rc; +} + +static unsigned int mctp_route_mtu(struct mctp_route *rt) +{ + return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu); +} + +static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) +{ + struct mctp_hdr *hdr = mctp_hdr(skb); + char daddr_buf[MAX_ADDR_LEN]; + char *daddr = NULL; + unsigned int mtu; + int rc; + + skb->protocol = htons(ETH_P_MCTP); + + mtu = READ_ONCE(skb->dev->mtu); + if (skb->len > mtu) { + kfree_skb(skb); + return -EMSGSIZE; + } + + /* If lookup fails let the device handle daddr==NULL */ + if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + daddr = daddr_buf; + + rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + daddr, skb->dev->dev_addr, skb->len); + if (rc) { + kfree_skb(skb); + return -EHOSTUNREACH; + } + + rc = dev_queue_xmit(skb); + if (rc) + rc = net_xmit_errno(rc); + + return rc; +} + +/* route alloc/release */ +static void mctp_route_release(struct mctp_route *rt) +{ + if (refcount_dec_and_test(&rt->refs)) { + dev_put(rt->dev->dev); + kfree_rcu(rt, rcu); + } +} + +/* returns a route with the refcount at 1 */ +static struct mctp_route *mctp_route_alloc(void) +{ + struct mctp_route *rt; + + rt = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!rt) + return NULL; + + INIT_LIST_HEAD(&rt->list); + refcount_set(&rt->refs, 1); + rt->output = mctp_route_discard; + + return rt; +} + +unsigned int mctp_default_net(struct net *net) +{ + return READ_ONCE(net->mctp.default_net); +} + +int mctp_default_net_set(struct net *net, unsigned int index) +{ + if (index == 0) + return -EINVAL; + WRITE_ONCE(net->mctp.default_net, index); + return 0; +} + +/* tag management */ +static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, + struct mctp_sock *msk) +{ + struct netns_mctp *mns = &net->mctp; + + lockdep_assert_held(&mns->keys_lock); + + /* we hold the net->key_lock here, allowing updates to both + * then net and sk + */ + hlist_add_head_rcu(&key->hlist, &mns->keys); + hlist_add_head_rcu(&key->sklist, &msk->keys); +} + +/* Allocate a locally-owned tag value for (saddr, daddr), and reserve + * it for the socket msk + */ +static int mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t saddr, mctp_eid_t daddr, u8 *tagp) +{ + struct net *net = sock_net(&msk->sk); + struct netns_mctp *mns = &net->mctp; + struct mctp_sk_key *key, *tmp; + unsigned long flags; + int rc = -EAGAIN; + u8 tagbits; + + /* be optimistic, alloc now */ + key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); + if (!key) + return -ENOMEM; + + /* 8 possible tag values */ + tagbits = 0xff; + + spin_lock_irqsave(&mns->keys_lock, flags); + + /* Walk through the existing keys, looking for potential conflicting + * tags. If we find a conflict, clear that bit from tagbits + */ + hlist_for_each_entry(tmp, &mns->keys, hlist) { + /* if we don't own the tag, it can't conflict */ + if (tmp->tag & MCTP_HDR_FLAG_TO) + continue; + + if ((tmp->peer_addr == daddr || + tmp->peer_addr == MCTP_ADDR_ANY) && + tmp->local_addr == saddr) + tagbits &= ~(1 << tmp->tag); + + if (!tagbits) + break; + } + + if (tagbits) { + key->tag = __ffs(tagbits); + mctp_reserve_tag(net, key, msk); + *tagp = key->tag; + rc = 0; + } + + spin_unlock_irqrestore(&mns->keys_lock, flags); + + if (!tagbits) + kfree(key); + + return rc; +} + +/* routing lookups */ +static bool mctp_rt_match_eid(struct mctp_route *rt, + unsigned int net, mctp_eid_t eid) +{ + return READ_ONCE(rt->dev->net) == net && + rt->min <= eid && rt->max >= eid; +} + +/* compares match, used for duplicate prevention */ +static bool mctp_rt_compare_exact(struct mctp_route *rt1, + struct mctp_route *rt2) +{ + ASSERT_RTNL(); + return rt1->dev->net == rt2->dev->net && + rt1->min == rt2->min && + rt1->max == rt2->max; +} + +struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr) +{ + struct mctp_route *tmp, *rt = NULL; + + list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { + /* TODO: add metrics */ + if (mctp_rt_match_eid(tmp, dnet, daddr)) { + if (refcount_inc_not_zero(&tmp->refs)) { + rt = tmp; + break; + } + } + } + + return rt; +} + +/* sends a skb to rt and releases the route. */ +int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb) +{ + int rc; + + rc = rt->output(rt, skb); + mctp_route_release(rt); + return rc; +} + +static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, + unsigned int mtu, u8 tag) +{ + const unsigned int hlen = sizeof(struct mctp_hdr); + struct mctp_hdr *hdr, *hdr2; + unsigned int pos, size; + struct sk_buff *skb2; + int rc; + u8 seq; + + hdr = mctp_hdr(skb); + seq = 0; + rc = 0; + + if (mtu < hlen + 1) { + kfree_skb(skb); + return -EMSGSIZE; + } + + /* we've got the header */ + skb_pull(skb, hlen); + + for (pos = 0; pos < skb->len;) { + /* size of message payload */ + size = min(mtu - hlen, skb->len - pos); + + skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL); + if (!skb2) { + rc = -ENOMEM; + break; + } + + /* generic skb copy */ + skb2->protocol = skb->protocol; + skb2->priority = skb->priority; + skb2->dev = skb->dev; + memcpy(skb2->cb, skb->cb, sizeof(skb2->cb)); + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* establish packet */ + skb_reserve(skb2, MCTP_HEADER_MAXLEN); + skb_reset_network_header(skb2); + skb_put(skb2, hlen + size); + skb2->transport_header = skb2->network_header + hlen; + + /* copy header fields, calculate SOM/EOM flags & seq */ + hdr2 = mctp_hdr(skb2); + hdr2->ver = hdr->ver; + hdr2->dest = hdr->dest; + hdr2->src = hdr->src; + hdr2->flags_seq_tag = tag & + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + if (pos == 0) + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM; + + if (pos + size == skb->len) + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM; + + hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT; + + /* copy message payload */ + skb_copy_bits(skb, pos, skb_transport_header(skb2), size); + + /* do route, but don't drop the rt reference */ + rc = rt->output(rt, skb2); + if (rc) + break; + + seq = (seq + 1) & MCTP_HDR_SEQ_MASK; + pos += size; + } + + mctp_route_release(rt); + consume_skb(skb); + return rc; +} + +int mctp_local_output(struct sock *sk, struct mctp_route *rt, + struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_hdr *hdr; + unsigned long flags; + unsigned int mtu; + mctp_eid_t saddr; + int rc; + u8 tag; + + if (WARN_ON(!rt->dev)) + return -EINVAL; + + spin_lock_irqsave(&rt->dev->addrs_lock, flags); + if (rt->dev->num_addrs == 0) { + rc = -EHOSTUNREACH; + } else { + /* use the outbound interface's first address as our source */ + saddr = rt->dev->addrs[0]; + rc = 0; + } + spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); + + if (rc) + return rc; + + if (req_tag & MCTP_HDR_FLAG_TO) { + rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag); + if (rc) + return rc; + tag |= MCTP_HDR_FLAG_TO; + } else { + tag = req_tag; + } + + + skb->protocol = htons(ETH_P_MCTP); + skb->priority = 0; + skb_reset_transport_header(skb); + skb_push(skb, sizeof(struct mctp_hdr)); + skb_reset_network_header(skb); + skb->dev = rt->dev->dev; + + /* cb->net will have been set on initial ingress */ + cb->src = saddr; + + /* set up common header fields */ + hdr = mctp_hdr(skb); + hdr->ver = 1; + hdr->dest = daddr; + hdr->src = saddr; + + mtu = mctp_route_mtu(rt); + + if (skb->len + sizeof(struct mctp_hdr) <= mtu) { + hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | + tag; + return mctp_do_route(rt, skb); + } else { + return mctp_do_fragment_route(rt, skb, mtu, tag); + } +} + +/* route management */ +static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, + unsigned int daddr_extent, unsigned int mtu, + unsigned char type) +{ + int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb); + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *ert; + + if (!mctp_address_ok(daddr_start)) + return -EINVAL; + + if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + return -EINVAL; + + switch (type) { + case RTN_LOCAL: + rtfn = mctp_route_input; + break; + case RTN_UNICAST: + rtfn = mctp_route_output; + break; + default: + return -EINVAL; + } + + rt = mctp_route_alloc(); + if (!rt) + return -ENOMEM; + + rt->min = daddr_start; + rt->max = daddr_start + daddr_extent; + rt->mtu = mtu; + rt->dev = mdev; + dev_hold(rt->dev->dev); + rt->type = type; + rt->output = rtfn; + + ASSERT_RTNL(); + /* Prevent duplicate identical routes. */ + list_for_each_entry(ert, &net->mctp.routes, list) { + if (mctp_rt_compare_exact(rt, ert)) { + mctp_route_release(rt); + return -EEXIST; + } + } + + list_add_rcu(&rt->list, &net->mctp.routes); + + return 0; +} + +static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start, + unsigned int daddr_extent) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *tmp; + mctp_eid_t daddr_end; + bool dropped; + + if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + return -EINVAL; + + daddr_end = daddr_start + daddr_extent; + dropped = false; + + ASSERT_RTNL(); + + list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { + if (rt->dev == mdev && + rt->min == daddr_start && rt->max == daddr_end) { + list_del_rcu(&rt->list); + /* TODO: immediate RTM_DELROUTE */ + mctp_route_release(rt); + dropped = true; + } + } + + return dropped ? 0 : -ENOENT; +} + +int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr) +{ + return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL); +} + +int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr) +{ + return mctp_route_remove(mdev, addr, 0); +} + +/* removes all entries for a given device */ +void mctp_route_remove_dev(struct mctp_dev *mdev) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *tmp; + + ASSERT_RTNL(); + list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { + if (rt->dev == mdev) { + list_del_rcu(&rt->list); + /* TODO: immediate RTM_DELROUTE */ + mctp_route_release(rt); + } + } +} + +/* Incoming packet-handling */ + +static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct mctp_skb_cb *cb; + struct mctp_route *rt; + struct mctp_hdr *mh; + + /* basic non-data sanity checks */ + if (dev->type != ARPHRD_MCTP) + goto err_drop; + + if (!pskb_may_pull(skb, sizeof(struct mctp_hdr))) + goto err_drop; + + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + + /* We have enough for a header; decode and route */ + mh = mctp_hdr(skb); + if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX) + goto err_drop; + + cb = __mctp_cb(skb); + rcu_read_lock(); + cb->net = READ_ONCE(__mctp_dev_get(dev)->net); + rcu_read_unlock(); + + rt = mctp_route_lookup(net, cb->net, mh->dest); + if (!rt) + goto err_drop; + + mctp_do_route(rt, skb); + + return NET_RX_SUCCESS; + +err_drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type mctp_packet_type = { + .type = cpu_to_be16(ETH_P_MCTP), + .func = mctp_pkttype_receive, +}; + +/* netlink interface */ + +static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = { + [RTA_DST] = { .type = NLA_U8 }, + [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + +/* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing. + * tb must hold RTA_MAX+1 elements. + */ +static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr **tb, struct rtmsg **rtm, + struct mctp_dev **mdev, mctp_eid_t *daddr_start) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + unsigned int ifindex; + int rc; + + rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, + rta_mctp_policy, extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect format"); + return rc; + } + + if (!tb[RTA_DST]) { + NL_SET_ERR_MSG(extack, "dst EID missing"); + return -EINVAL; + } + *daddr_start = nla_get_u8(tb[RTA_DST]); + + if (!tb[RTA_OIF]) { + NL_SET_ERR_MSG(extack, "ifindex missing"); + return -EINVAL; + } + ifindex = nla_get_u32(tb[RTA_OIF]); + + *rtm = nlmsg_data(nlh); + if ((*rtm)->rtm_family != AF_MCTP) { + NL_SET_ERR_MSG(extack, "route family must be AF_MCTP"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "bad ifindex"); + return -ENODEV; + } + *mdev = mctp_dev_get_rtnl(dev); + if (!*mdev) + return -ENODEV; + + if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, "no routes to loopback"); + return -EINVAL; + } + + return 0; +} + +static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RTA_MAX + 1]; + mctp_eid_t daddr_start; + struct mctp_dev *mdev; + struct rtmsg *rtm; + unsigned int mtu; + int rc; + + rc = mctp_route_nlparse(skb, nlh, extack, tb, + &rtm, &mdev, &daddr_start); + if (rc < 0) + return rc; + + if (rtm->rtm_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST"); + return -EINVAL; + } + + /* TODO: parse mtu from nlparse */ + mtu = 0; + + if (rtm->rtm_type != RTN_UNICAST) + return -EINVAL; + + rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, + rtm->rtm_type); + return rc; +} + +static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RTA_MAX + 1]; + mctp_eid_t daddr_start; + struct mctp_dev *mdev; + struct rtmsg *rtm; + int rc; + + rc = mctp_route_nlparse(skb, nlh, extack, tb, + &rtm, &mdev, &daddr_start); + if (rc < 0) + return rc; + + /* we only have unicast routes */ + if (rtm->rtm_type != RTN_UNICAST) + return -EINVAL; + + rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len); + return rc; +} + +static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, + u32 portid, u32 seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *hdr; + void *metrics; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->rtm_family = AF_MCTP; + + /* we use the _len fields as a number of EIDs, rather than + * a number of bits in the address + */ + hdr->rtm_dst_len = rt->max - rt->min; + hdr->rtm_src_len = 0; + hdr->rtm_tos = 0; + hdr->rtm_table = RT_TABLE_DEFAULT; + hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ + hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */ + hdr->rtm_type = rt->type; + + if (nla_put_u8(skb, RTA_DST, rt->min)) + goto cancel; + + metrics = nla_nest_start_noflag(skb, RTA_METRICS); + if (!metrics) + goto cancel; + + if (rt->mtu) { + if (nla_put_u32(skb, RTAX_MTU, rt->mtu)) + goto cancel; + } + + nla_nest_end(skb, metrics); + + if (rt->dev) { + if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) + goto cancel; + } + + /* TODO: conditional neighbour physaddr? */ + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_rtinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mctp_route *rt; + int s_idx, idx; + + /* TODO: allow filtering on route data, possibly under + * cb->strict_check + */ + + /* TODO: change to struct overlay */ + s_idx = cb->args[0]; + idx = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (idx++ < s_idx) + continue; + if (mctp_fill_rtinfo(skb, rt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, NLM_F_MULTI) < 0) + break; + } + + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +/* net namespace implementation */ +static int __net_init mctp_routes_net_init(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + + INIT_LIST_HEAD(&ns->routes); + INIT_HLIST_HEAD(&ns->binds); + mutex_init(&ns->bind_lock); + INIT_HLIST_HEAD(&ns->keys); + spin_lock_init(&ns->keys_lock); + WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET)); + return 0; +} + +static void __net_exit mctp_routes_net_exit(struct net *net) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) + mctp_route_release(rt); +} + +static struct pernet_operations mctp_net_ops = { + .init = mctp_routes_net_init, + .exit = mctp_routes_net_exit, +}; + +int __init mctp_routes_init(void) +{ + dev_add_pack(&mctp_packet_type); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE, + NULL, mctp_dump_rtinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE, + mctp_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE, + mctp_delroute, NULL, 0); + + return register_pernet_subsys(&mctp_net_ops); +} + +void __exit mctp_routes_exit(void) +{ + unregister_pernet_subsys(&mctp_net_ops); + rtnl_unregister(PF_MCTP, RTM_DELROUTE); + rtnl_unregister(PF_MCTP, RTM_NEWROUTE); + rtnl_unregister(PF_MCTP, RTM_GETROUTE); + dev_remove_pack(&mctp_packet_type); +} diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 05a21dd072df..ffeb2df8be7a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -407,7 +407,6 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Verify ttl is valid */ if (dec.ttl <= 1) goto err; - dec.ttl -= 1; /* Find the output device */ out_dev = rcu_dereference(nh->nh_dev); @@ -431,6 +430,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); + dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 96ba616f59bf..8b235468c88f 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -4,7 +4,9 @@ * Copyright (c) 2019, Tessares SA. */ +#ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#endif #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -15,36 +17,68 @@ static int mptcp_pernet_id; struct mptcp_pernet { +#ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; +#endif - int mptcp_enabled; unsigned int add_addr_timeout; + unsigned int stale_loss_cnt; + u8 mptcp_enabled; + u8 checksum_enabled; + u8 allow_join_initial_addr_port; }; -static struct mptcp_pernet *mptcp_get_pernet(struct net *net) +static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) { return net_generic(net, mptcp_pernet_id); } -int mptcp_is_enabled(struct net *net) +int mptcp_is_enabled(const struct net *net) { return mptcp_get_pernet(net)->mptcp_enabled; } -unsigned int mptcp_get_add_addr_timeout(struct net *net) +unsigned int mptcp_get_add_addr_timeout(const struct net *net) { return mptcp_get_pernet(net)->add_addr_timeout; } +int mptcp_is_checksum_enabled(const struct net *net) +{ + return mptcp_get_pernet(net)->checksum_enabled; +} + +int mptcp_allow_join_id0(const struct net *net) +{ + return mptcp_get_pernet(net)->allow_join_initial_addr_port; +} + +unsigned int mptcp_stale_loss_cnt(const struct net *net) +{ + return mptcp_get_pernet(net)->stale_loss_cnt; +} + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; + pernet->checksum_enabled = 0; + pernet->allow_join_initial_addr_port = 1; + pernet->stale_loss_cnt = 4; +} + +#ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", @@ -52,15 +86,31 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "checksum_enabled", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "allow_join_initial_addr_port", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "stale_loss_cnt", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, {} }; -static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) -{ - pernet->mptcp_enabled = 1; - pernet->add_addr_timeout = TCP_RTO_MAX; -} - static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; @@ -75,6 +125,9 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; + table[2].data = &pernet->checksum_enabled; + table[3].data = &pernet->allow_join_initial_addr_port; + table[4].data = &pernet->stale_loss_cnt; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) @@ -100,6 +153,17 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) kfree(table); } +#else + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + return 0; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} + +#endif /* CONFIG_SYSCTL */ + static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index eb2dc6dbe212..b21ff9be04c6 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -25,6 +25,8 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), + SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), @@ -42,6 +44,11 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), + SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), + SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), + SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), + SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), + SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index f0da4f060fe1..ecd3d8b117e0 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -18,6 +18,8 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ + MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ @@ -35,6 +37,11 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ + MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */ + MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */ + MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ + MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ + MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f16d9b5ee978..f48eb6315bbb 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -57,10 +57,8 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb, kfree_skb(rep); goto out; } - err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + out: sock_put(sk); @@ -144,6 +142,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_write_seq = READ_ONCE(msk->write_seq); info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9b263f27ce9b..c41273cefc51 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -44,7 +44,20 @@ static void mptcp_parse_option(const struct sk_buff *skb, else expected_opsize = TCPOLEN_MPTCP_MPC_SYN; } - if (opsize != expected_opsize) + + /* Cfr RFC 8684 Section 3.3.0: + * If a checksum is present but its use had + * not been negotiated in the MP_CAPABLE handshake, the receiver MUST + * close the subflow with a RST, as it is not behaving as negotiated. + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST, as it is considered + * broken + * We parse even option with mismatching csum presence, so that + * later in subflow_data_ready we can trigger the reset. + */ + if (opsize != expected_opsize && + (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || + opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ @@ -66,18 +79,13 @@ static void mptcp_parse_option(const struct sk_buff *skb, * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." - * - * Section 3.3.0: - * "If a checksum is not present when its use has been - * negotiated, the receiver MUST close the subflow with a RST as - * it is considered broken." - * - * We don't implement DSS checksum - fall back to TCP. */ if (flags & MPTCP_CAP_CHECKSUM_REQD) - break; + mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; + + mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); - mp_opt->mp_capable = 1; + mp_opt->suboptions |= OPTIONS_MPTCP_MPC; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; @@ -86,25 +94,30 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } - if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably." */ - mp_opt->dss = 1; + mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } - pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, - mp_opt->rcvr_key, mp_opt->data_len); + mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: - mp_opt->mp_join = 1; + mp_opt->suboptions |= OPTIONS_MPTCP_MPJ; if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; @@ -130,7 +143,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac"); } else { - mp_opt->mp_join = 0; + mp_opt->suboptions &= ~OPTIONS_MPTCP_MPJ; } break; @@ -171,17 +184,14 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } - /* RFC 6824, Section 3.3: - * If a checksum is present, but its use had - * not been negotiated in the MP_CAPABLE handshake, - * the checksum field MUST be ignored. + /* Always parse any csum presence combination, we will enforce + * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; - mp_opt->dss = 1; - + mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); @@ -209,9 +219,16 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { + mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + ptr += 2; + } + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, - mp_opt->data_len); + mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), + mp_opt->csum); } break; @@ -242,8 +259,10 @@ static void mptcp_parse_option(const struct sk_buff *skb, break; } - mp_opt->add_addr = 1; + mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; + mp_opt->addr.port = 0; + mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -280,7 +299,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr++; - mp_opt->rm_addr = 1; + mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; @@ -291,7 +310,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize != TCPOLEN_MPTCP_PRIO) break; - mp_opt->mp_prio = 1; + mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d", mp_opt->backup); break; @@ -303,7 +322,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; - mp_opt->fastclose = 1; + mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; break; case MPTCPOPT_RST: @@ -312,18 +331,30 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; - mp_opt->reset = 1; + + mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; break; + case MPTCPOPT_MP_FAIL: + if (opsize != TCPOLEN_MPTCP_FAIL) + break; + + ptr += 2; + mp_opt->suboptions |= OPTION_MPTCP_FAIL; + mp_opt->fail_seq = get_unaligned_be64(ptr); + pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq); + break; + default: break; } } -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); @@ -331,16 +362,7 @@ void mptcp_get_options(const struct sk_buff *skb, int length; /* initialize option status */ - mp_opt->mp_capable = 0; - mp_opt->mp_join = 0; - mp_opt->add_addr = 0; - mp_opt->ahmac = 0; - mp_opt->fastclose = 0; - mp_opt->addr.port = 0; - mp_opt->rm_addr = 0; - mp_opt->dss = 0; - mp_opt->mp_prio = 0; - mp_opt->reset = 0; + mp_opt->suboptions = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -382,6 +404,8 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); + opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { @@ -437,8 +461,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; + u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than @@ -467,16 +493,27 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); + opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ - if (data_len > 0) - *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); - else + if (data_len > 0) { + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + if (opts->csum_reqd) { + /* we need to propagate more info to csum the pseudo hdr */ + opts->ext_copy.data_seq = mpext->data_seq; + opts->ext_copy.subflow_seq = mpext->subflow_seq; + opts->ext_copy.csum = mpext->csum; + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + *size = ALIGN(len, 4); + } else { *size = TCPOLEN_MPTCP_MPC_ACK; + } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, @@ -537,20 +574,24 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; u64 ack_seq; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { - unsigned int map_size; + unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; - map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + if (mpext) { + if (opts->csum_reqd) + map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; - remaining -= map_size; - dss_size = map_size; - if (mpext) opts->ext_copy = *mpext; + } + remaining -= map_size; + dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); + opts->suboptions = OPTION_MPTCP_DSS; ret = true; } @@ -574,6 +615,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; + opts->suboptions = OPTION_MPTCP_DSS; WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ @@ -626,29 +668,34 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * bool port; int len; - if ((mptcp_pm_should_add_signal_ipv6(msk) || - mptcp_pm_should_add_signal_port(msk) || - mptcp_pm_should_add_signal_echo(msk)) && - skb && skb_is_tcp_pure_ack(skb)) { - pr_debug("drop other suboptions"); - opts->suboptions = 0; - opts->ext_copy.use_ack = 0; - opts->ext_copy.use_map = 0; - remaining += opt_size; - drop_other_suboptions = true; - } - + /* add addr will strip the existing options, be sure to avoid breaking + * MPC/MPJ handshakes + */ if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &opts->addr, &echo, &port))) + (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + &echo, &port, &drop_other_suboptions)) return false; + if (drop_other_suboptions) + remaining += opt_size; len = mptcp_add_addr_len(opts->addr.family, echo, port); if (remaining < len) return false; *size = len; - if (drop_other_suboptions) + if (drop_other_suboptions) { + pr_debug("drop other suboptions"); + opts->suboptions = 0; + + /* note that e.g. DSS could have written into the memory + * aliased by ahmac, we must reset the field here + * to avoid appending the hmac even for ADD_ADDR echo + * options + */ + opts->ahmac = 0; *size -= opt_size; + } opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { opts->ahmac = add_addr_generate_hmac(msk->local_key, @@ -698,7 +745,12 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - if (!subflow->send_mp_prio) + /* can't send MP_PRIO with MPC, as they share the same option space: + * 'backup'. Also it makes no sense at all + */ + if (!subflow->send_mp_prio || + ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | + OPTION_MPTCP_MPC_ACK) & opts->suboptions)) return false; /* account for the trailing 'nop' option */ @@ -714,7 +766,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, return true; } -static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, +static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -722,12 +774,36 @@ static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_bu const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) - return; + return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; + + return true; +} + +static bool mptcp_established_options_mp_fail(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (likely(!subflow->send_mp_fail)) + return false; + + if (remaining < TCPOLEN_MPTCP_FAIL) + return false; + + *size = TCPOLEN_MPTCP_FAIL; + opts->suboptions |= OPTION_MPTCP_FAIL; + opts->fail_seq = subflow->map_seq; + + pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + + return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, @@ -746,15 +822,28 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { - mptcp_established_options_rst(sk, skb, size, remaining, opts); + if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + } + if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) { ret = true; + if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + return true; + } + } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal @@ -791,6 +880,8 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; + opts->csum_reqd = subflow_req->csum_reqd; + opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); @@ -825,7 +916,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && - subflow->mp_join && mp_opt->mp_join && + subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && READ_ONCE(msk->pm.server_side)) tcp_send_ack(ssk); goto fully_established; @@ -842,25 +933,21 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return subflow->mp_capable; } - if (mp_opt->dss && mp_opt->use_ack) { + if (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || + ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo)) { /* subflows are fully established as soon as we get any - * additional ack. + * additional ack, including ADD_ADDR. */ subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); goto fully_established; } - if (mp_opt->add_addr) { - WRITE_ONCE(msk->fully_established, true); - return true; - } - /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ - if (!mp_opt->mp_capable) { + if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; @@ -869,6 +956,9 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); mptcp_subflow_fully_established(subflow, mp_opt); @@ -896,19 +986,20 @@ reset: return false; } -static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { - u32 old_ack32, cur_ack32; - - if (use_64bit) - return cur_ack; - - old_ack32 = (u32)old_ack; - cur_ack32 = (u32)cur_ack; - cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; - if (unlikely(before(cur_ack32, old_ack32))) - return cur_ack + (1LL << 32); - return cur_ack; + u32 old_seq32, cur_seq32; + + old_seq32 = (u32)old_seq; + cur_seq32 = (u32)cur_seq; + cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; + if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) + return cur_seq + (1LL << 32); + + /* reverse wrap could happen, too */ + if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) + return cur_seq - (1LL << 32); + return cur_seq; } static void ack_update_msk(struct mptcp_sock *msk, @@ -926,11 +1017,13 @@ static void ack_update_msk(struct mptcp_sock *msk, * more dangerous than missing an ack */ old_snd_una = msk->snd_una; - new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); - /* ACK for data not even sent yet? Ignore. */ - if (after64(new_snd_una, snd_nxt)) - new_snd_una = old_snd_una; + /* ACK for data not even sent yet and even above recovery bound? Ignore.*/ + if (unlikely(after64(new_snd_una, snd_nxt))) { + if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt)) + new_snd_una = old_snd_una; + } new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; @@ -963,7 +1056,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us return false; WRITE_ONCE(msk->rcv_data_fin_seq, - expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); + mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; @@ -988,7 +1081,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, return hmac == mp_opt->ahmac; } -void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) +/* Return false if a subflow has been reset, else return true */ +bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); @@ -1006,55 +1100,62 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) __mptcp_check_push(subflow->conn, sk); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); - return; + return true; } - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); + + /* The subflow can be in close state only if check_fully_established() + * just sent a reset. If so, tell the caller to ignore the current packet. + */ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) - return; + return sk->sk_state != TCP_CLOSE; - if (mp_opt.fastclose && - msk->local_key == mp_opt.rcvr_key) { - WRITE_ONCE(msk->rcv_fastclose, true); - mptcp_schedule_work((struct sock *)msk); - } + if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { + if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } - if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { - if (!mp_opt.echo) { - mptcp_pm_add_addr_received(msk, &mp_opt.addr); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); - } else { - mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); - mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); + if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && + add_addr_hmac_valid(msk, &mp_opt)) { + if (!mp_opt.echo) { + mptcp_pm_add_addr_received(msk, &mp_opt.addr); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); + } else { + mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); + mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); + } + + if (mp_opt.addr.port) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } - if (mp_opt.addr.port) - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); + if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) + mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); - mp_opt.add_addr = 0; - } + if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { + mptcp_pm_mp_prio_received(sk, mp_opt.backup); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); + } - if (mp_opt.rm_addr) { - mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); - mp_opt.rm_addr = 0; - } + if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { + mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); + } - if (mp_opt.mp_prio) { - mptcp_pm_mp_prio_received(sk, mp_opt.backup); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); - mp_opt.mp_prio = 0; - } + if (mp_opt.suboptions & OPTION_MPTCP_RST) { + subflow->reset_seen = 1; + subflow->reset_reason = mp_opt.reset_reason; + subflow->reset_transient = mp_opt.reset_transient; + } - if (mp_opt.reset) { - subflow->reset_seen = 1; - subflow->reset_reason = mp_opt.reset_reason; - subflow->reset_transient = mp_opt.reset_transient; + if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) + return true; } - if (!mp_opt.dss) - return; - /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ @@ -1072,16 +1173,16 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) schedule_work(&msk->work)) sock_hold(subflow->conn); - return; + return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) - return; + return true; memset(mpext, 0, sizeof(*mpext)); - if (mp_opt.use_map) { + if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data @@ -1101,7 +1202,13 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; + mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); + + if (mpext->csum_reqd) + mpext->csum = mp_opt.csum; } + + return true; } static void mptcp_set_rwin(const struct tcp_sock *tp) @@ -1120,25 +1227,133 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + struct csum_pseudo_header header; + __wsum csum; + + /* cfr RFC 8684 3.3.1.: + * the data sequence number used in the pseudo-header is + * always the 64-bit value, irrespective of what length is used in the + * DSS option itself. + */ + header.data_seq = cpu_to_be64(mpext->data_seq); + header.subflow_seq = htonl(mpext->subflow_seq); + header.data_len = htons(mpext->data_len); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + return (__force u16)csum_fold(csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { - if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions) { - u8 len; + if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_fail = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, + TCPOLEN_MPTCP_FAIL, + 0, 0); + put_unaligned_be64(opts->fail_seq, ptr); + ptr += 2; + } + + /* RST is mutually exclusive with everything else */ + if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + return; + } + + /* DSS, MPC, MPJ and ADD_ADDR are mutually exclusive, see + * mptcp_established_options*() + */ + if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { + struct mptcp_ext *mpext = &opts->ext_copy; + u8 len = TCPOLEN_MPTCP_DSS_BASE; + u8 flags = 0; + + if (mpext->use_ack) { + flags = MPTCP_DSS_HAS_ACK; + if (mpext->ack64) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags |= MPTCP_DSS_ACK64; + } else { + len += TCPOLEN_MPTCP_DSS_ACK32; + } + } + + if (mpext->use_map) { + len += TCPOLEN_MPTCP_DSS_MAP64; + + /* Use only 64-bit mapping flags for now, add + * support for optional 32-bit mappings later. + */ + flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; + if (mpext->data_fin) + flags |= MPTCP_DSS_DATA_FIN; + + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + + *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); - if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + if (mpext->use_ack) { + if (mpext->ack64) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } else { + put_unaligned_be32(mpext->data_ack32, ptr); + ptr += 1; + } + } + + if (mpext->use_map) { + put_unaligned_be64(mpext->data_seq, ptr); + ptr += 2; + put_unaligned_be32(mpext->subflow_seq, ptr); + ptr += 1; + if (opts->csum_reqd) { + put_unaligned_be32(mpext->data_len << 16 | + mptcp_make_csum(mpext), ptr); + } else { + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } + } + } else if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | + OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + u8 len, flag = MPTCP_CAP_HMAC_SHA256; + + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; - else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - else if (opts->ext_copy.data_len) + } else if (opts->ext_copy.data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; - else + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } else { len = TCPOLEN_MPTCP_MPC_ACK; + } + + if (opts->csum_reqd) + flag |= MPTCP_CAP_CHECKSUM_REQD; + + if (!opts->allow_join_id0) + flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, - MPTCP_CAP_HMAC_SHA256); + flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -1154,13 +1369,39 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, if (!opts->ext_copy.data_len) goto mp_capable_done; - put_unaligned_be32(opts->ext_copy.data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + mptcp_make_csum(&opts->ext_copy), ptr); + } else { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } ptr += 1; - } -mp_capable_done: - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + /* MPC is additionally mutually exclusive with MP_PRIO */ + goto mp_capable_done; + } else if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYN, + opts->backup, opts->join_id); + put_unaligned_be32(opts->token, ptr); + ptr += 1; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYNACK, + opts->backup, opts->join_id); + put_unaligned_be64(opts->thmac, ptr); + ptr += 2; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } else if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_ACK, 0, 0); + memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); + ptr += 5; + } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; @@ -1218,6 +1459,19 @@ mp_capable_done: } } + if (OPTION_MPTCP_PRIO & opts->suboptions) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_prio = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, + TCPOLEN_MPTCP_PRIO, + opts->backup, TCPOPT_NOP); + } + +mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; @@ -1238,99 +1492,6 @@ mp_capable_done: } } - if (OPTION_MPTCP_PRIO & opts->suboptions) { - const struct sock *ssk = (const struct sock *)tp; - struct mptcp_subflow_context *subflow; - - subflow = mptcp_subflow_ctx(ssk); - subflow->send_mp_prio = 0; - - *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, - TCPOLEN_MPTCP_PRIO, - opts->backup, TCPOPT_NOP); - } - - if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_SYN, - opts->backup, opts->join_id); - put_unaligned_be32(opts->token, ptr); - ptr += 1; - put_unaligned_be32(opts->nonce, ptr); - ptr += 1; - } - - if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_SYNACK, - opts->backup, opts->join_id); - put_unaligned_be64(opts->thmac, ptr); - ptr += 2; - put_unaligned_be32(opts->nonce, ptr); - ptr += 1; - } - - if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_ACK, 0, 0); - memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); - ptr += 5; - } - - if (OPTION_MPTCP_RST & opts->suboptions) - *ptr++ = mptcp_option(MPTCPOPT_RST, - TCPOLEN_MPTCP_RST, - opts->reset_transient, - opts->reset_reason); - - if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { - struct mptcp_ext *mpext = &opts->ext_copy; - u8 len = TCPOLEN_MPTCP_DSS_BASE; - u8 flags = 0; - - if (mpext->use_ack) { - flags = MPTCP_DSS_HAS_ACK; - if (mpext->ack64) { - len += TCPOLEN_MPTCP_DSS_ACK64; - flags |= MPTCP_DSS_ACK64; - } else { - len += TCPOLEN_MPTCP_DSS_ACK32; - } - } - - if (mpext->use_map) { - len += TCPOLEN_MPTCP_DSS_MAP64; - - /* Use only 64-bit mapping flags for now, add - * support for optional 32-bit mappings later. - */ - flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; - if (mpext->data_fin) - flags |= MPTCP_DSS_DATA_FIN; - } - - *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); - - if (mpext->use_ack) { - if (mpext->ack64) { - put_unaligned_be64(mpext->data_ack, ptr); - ptr += 2; - } else { - put_unaligned_be32(mpext->data_ack32, ptr); - ptr += 1; - } - } - - if (mpext->use_map) { - put_unaligned_be64(mpext->data_seq, ptr); - ptr += 2; - put_unaligned_be32(mpext->subflow_seq, ptr); - ptr += 1; - put_unaligned_be32(mpext->data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); - } - } - if (tp) mptcp_set_rwin(tp); } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 9d00fa6d22e9..6ab386ff3294 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,6 +10,8 @@ #include <net/mptcp.h> #include "protocol.h" +#include "mib.h" + /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -18,23 +20,23 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, { u8 add_addr = READ_ONCE(msk->pm.addr_signal); - pr_debug("msk=%p, local_id=%d", msk, addr->id); + pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo); lockdep_assert_held(&msk->pm.lock); - if (add_addr) { - pr_warn("addr_signal error, add_addr=%d", add_addr); + if (add_addr & + (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { + pr_warn("addr_signal error, add_addr=%d, echo=%d", add_addr, echo); return -EINVAL; } - msk->pm.local = *addr; - add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); - if (echo) + if (echo) { + msk->pm.remote = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); - if (addr->family == AF_INET6) - add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); - if (addr->port) - add_addr |= BIT(MPTCP_ADD_ADDR_PORT); + } else { + msk->pm.local = *addr; + add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); + } WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } @@ -247,12 +249,21 @@ void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); } +void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) +{ + pr_debug("fail_seq=%llu", fail_seq); +} + /* path manager helpers */ -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo, bool *port) +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, + unsigned int opt_size, unsigned int remaining, + struct mptcp_addr_info *addr, bool *echo, + bool *port, bool *drop_other_suboptions) { int ret = false; + u8 add_addr; + u8 family; spin_lock_bh(&msk->pm.lock); @@ -260,14 +271,30 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; + /* always drop every other options for pure ack ADD_ADDR; this is a + * plain dup-ack from TCP perspective. The other MPTCP-relevant info, + * if any, will be carried by the 'original' TCP ack + */ + if (skb && skb_is_tcp_pure_ack(skb)) { + remaining += opt_size; + *drop_other_suboptions = true; + } + *echo = mptcp_pm_should_add_signal_echo(msk); - *port = mptcp_pm_should_add_signal_port(msk); + *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); - if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo, *port)) + family = *echo ? msk->pm.remote.family : msk->pm.local.family; + if (remaining < mptcp_add_addr_len(family, *echo, *port)) goto out_unlock; - *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.addr_signal, 0); + if (*echo) { + *addr = msk->pm.remote; + add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); + } else { + *addr = msk->pm.local; + add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); + } + WRITE_ONCE(msk->pm.addr_signal, add_addr); ret = true; out_unlock: @@ -279,6 +306,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list) { int ret = false, len; + u8 rm_addr; spin_lock_bh(&msk->pm.lock); @@ -286,16 +314,17 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; + rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL); len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); if (len < 0) { - WRITE_ONCE(msk->pm.addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); goto out_unlock; } if (remaining < len) goto out_unlock; *rm_list = msk->pm.rm_list_tx; - WRITE_ONCE(msk->pm.addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); ret = true; out_unlock: @@ -308,6 +337,25 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, skc); } +void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp); + + /* keep track of rtx periods with no progress */ + if (!subflow->stale_count) { + subflow->stale_rcv_tstamp = rcv_tstamp; + subflow->stale_count++; + } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { + if (subflow->stale_count < U8_MAX) + subflow->stale_count++; + mptcp_pm_nl_subflow_chk_stale(msk, ssk); + } else { + subflow->stale_count = 0; + mptcp_subflow_set_active(subflow); + } +} + void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; @@ -320,6 +368,7 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); + WRITE_ONCE(msk->pm.remote_deny_join_id0, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2469e06a3a9d..c4f9a5ce3815 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -27,7 +27,6 @@ struct mptcp_pm_addr_entry { struct mptcp_addr_info addr; u8 flags; int ifindex; - struct rcu_head rcu; struct socket *lsk; }; @@ -47,6 +46,7 @@ struct pm_nl_pernet { spinlock_t lock; struct list_head local_addr_list; unsigned int addrs; + unsigned int stale_loss_cnt; unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; @@ -317,14 +317,14 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!entry->addr.id) return; - if (mptcp_pm_should_add_signal(msk)) { + if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; } spin_lock_bh(&msk->pm.lock); - if (!mptcp_pm_should_add_signal(msk)) { + if (!mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); @@ -410,6 +410,55 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } +static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr, + struct mptcp_addr_info *addr) +{ + int i; + + for (i = 0; i < nr; i++) { + if (addresses_equal(&addrs[i], addr, addr->port)) + return true; + } + + return false; +} + +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, + struct mptcp_addr_info *addrs) +{ + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info remote = { 0 }; + unsigned int subflows_max; + int i = 0; + + subflows_max = mptcp_pm_get_subflows_max(msk); + + /* Non-fullmesh endpoint, fill in the single entry + * corresponding to the primary MPC subflow remote address + */ + if (!fullmesh) { + remote_address((struct sock_common *)sk, &remote); + msk->pm.subflows++; + addrs[i++] = remote; + } else { + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + remote_address((struct sock_common *)ssk, &remote); + if (!lookup_address_in_vec(addrs, i, &remote) && + msk->pm.subflows < subflows_max) { + msk->pm.subflows++; + addrs[i++] = remote; + } + } + } + + return i; +} + static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -451,18 +500,20 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check if should create a new subflow */ if (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.subflows < subflows_max && + !READ_ONCE(msk->pm.remote_deny_join_id0)) { local = select_local_address(pernet, msk); if (local) { - struct mptcp_addr_info remote = { 0 }; + bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + int i, nr; msk->pm.local_addr_used++; - msk->pm.subflows++; check_work_pending(msk); - remote_address((struct sock_common *)sk, &remote); + nr = fill_remote_addresses_vec(msk, fullmesh, addrs); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local->addr, &remote, - local->flags, local->ifindex); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); spin_lock_bh(&msk->pm.lock); return; } @@ -483,13 +534,67 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) mptcp_pm_create_subflow_or_signal_addr(msk); } +/* Fill all the local addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *addrs) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info local; + struct pm_nl_pernet *pernet; + unsigned int subflows_max; + int i = 0; + + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + subflows_max = mptcp_pm_get_subflows_max(msk); + + rcu_read_lock(); + __mptcp_flush_join_list(msk); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) + continue; + + if (entry->addr.family != sk->sk_family) { +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if ((entry->addr.family == AF_INET && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || + (sk->sk_family == AF_INET && + !ipv6_addr_v4mapped(&entry->addr.addr6))) +#endif + continue; + } + + if (msk->pm.subflows < subflows_max) { + msk->pm.subflows++; + addrs[i++] = entry->addr; + } + } + rcu_read_unlock(); + + /* If the array is empty, fill in the single + * 'IPADDRANY' local address + */ + if (!i) { + memset(&local, 0, sizeof(local)); + local.family = msk->pm.remote.family; + + msk->pm.subflows++; + addrs[i++] = local; + } + + return i; +} + static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - struct mptcp_addr_info local; unsigned int subflows_max; + int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); @@ -501,23 +606,22 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) goto add_addr_echo; - msk->pm.add_addr_accepted++; - msk->pm.subflows++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; - memset(&local, 0, sizeof(local)); - local.family = remote.family; + nr = fill_local_addresses_vec(msk, addrs); + + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local, &remote, 0, 0); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); add_addr_echo: @@ -542,14 +646,10 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) struct sock *ssk = mptcp_subflow_tcp_sock(subflow); spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for %s%s%s", - mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr", - mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", - mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); - - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); + pr_debug("send ack for %s", + mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"); + + mptcp_subflow_send_ack(ssk); spin_lock_bh(&msk->pm.lock); } } @@ -578,9 +678,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for mp_prio"); - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); + mptcp_subflow_send_ack(ssk); spin_lock_bh(&msk->pm.lock); return 0; @@ -897,6 +995,43 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, }; +void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = (struct sock *)msk; + unsigned int active_max_loss_cnt; + struct net *net = sock_net(sk); + unsigned int stale_loss_cnt; + bool slow; + + stale_loss_cnt = mptcp_stale_loss_cnt(net); + if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) + return; + + /* look for another available subflow not in loss state */ + active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); + mptcp_for_each_subflow(msk, iter) { + if (iter != subflow && mptcp_subflow_active(iter) && + iter->stale_count < active_max_loss_cnt) { + /* we have some alternatives, try to mark this subflow as idle ...*/ + slow = lock_sock_fast(ssk); + if (!tcp_rtx_and_write_queues_empty(ssk)) { + subflow->stale = 1; + __mptcp_retransmit_pending_data(sk); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE); + } + unlock_sock_fast(ssk, slow); + + /* always try to push the pending data regarless of re-injections: + * we can possibly use backup subflows now, and subflow selection + * is cheap under the msk socket lock + */ + __mptcp_push_pending(sk, 0); + return; + } + } +} + static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -971,8 +1106,14 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); - if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal when using port"); + return -EINVAL; + } entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + } return 0; } @@ -1059,6 +1200,27 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) return NULL; } +int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, + u8 *flags, int *ifindex) +{ + struct mptcp_pm_addr_entry *entry; + + *flags = 0; + *ifindex = 0; + + if (id) { + rcu_read_lock(); + entry = __lookup_addr_by_id(net_generic(net, pm_nl_pernet_id), id); + if (entry) { + *flags = entry->flags; + *ifindex = entry->ifindex; + } + rcu_read_unlock(); + } + + return 0; +} + static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, struct mptcp_addr_info *addr) { @@ -1127,36 +1289,12 @@ next: return 0; } -struct addr_entry_release_work { - struct rcu_work rwork; - struct mptcp_pm_addr_entry *entry; -}; - -static void mptcp_pm_release_addr_entry(struct work_struct *work) -{ - struct addr_entry_release_work *w; - struct mptcp_pm_addr_entry *entry; - - w = container_of(to_rcu_work(work), struct addr_entry_release_work, rwork); - entry = w->entry; - if (entry) { - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); - } - kfree(w); -} - -static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry) +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) { - struct addr_entry_release_work *w; - - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - INIT_RCU_WORK(&w->rwork, mptcp_pm_release_addr_entry); - w->entry = entry; - queue_rcu_work(system_wq, &w->rwork); - } + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); } static int mptcp_nl_remove_id_zero_address(struct net *net, @@ -1236,7 +1374,8 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&pernet->lock); mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); - mptcp_pm_free_addr_entry(entry); + synchronize_rcu(); + __mptcp_pm_release_addr_entry(entry); return ret; } @@ -1289,6 +1428,7 @@ static void mptcp_nl_remove_addrs_list(struct net *net, } } +/* caller must ensure the RCU grace period is already elapsed */ static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { @@ -1297,7 +1437,7 @@ static void __flush_addrs(struct list_head *list) cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); list_del_rcu(&cur->list); - mptcp_pm_free_addr_entry(cur); + __mptcp_pm_release_addr_entry(cur); } } @@ -1321,6 +1461,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); + synchronize_rcu(); __flush_addrs(&free_list); return 0; } @@ -1913,10 +2054,14 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - __reset_counters(pernet); pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); + pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. + */ + return 0; } @@ -1928,7 +2073,8 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); /* net is removed from namespace list, can't race with - * other modifiers + * other modifiers, also netns core already waited for a + * RCU grace period. */ __flush_addrs(&pernet->local_addr_list); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 632350018fb6..2602f1386160 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -39,10 +39,15 @@ struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; + u8 has_rxtstamp:1; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) +enum { + MPTCP_CMSG_TS = BIT(0), +}; + static struct percpu_counter mptcp_sockets_allocated; static void __mptcp_destroy_sock(struct sock *sk); @@ -272,6 +277,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; + bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); @@ -289,6 +295,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, sk->sk_forward_alloc += amount; } + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value @@ -296,6 +304,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -402,78 +411,93 @@ static void mptcp_set_datafin_timeout(const struct sock *sk) TCP_RTO_MIN << icsk->icsk_retransmits); } -static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +static void __mptcp_set_timeout(struct sock *sk, long tout) { - long tout = ssk && inet_csk(ssk)->icsk_pending ? - inet_csk(ssk)->icsk_timeout - jiffies : 0; - - if (tout <= 0) - tout = mptcp_sk(sk)->timer_ival; mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } +static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) +{ + const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; +} + +static void mptcp_set_timeout(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + long tout = 0; + + mptcp_for_each_subflow(mptcp_sk(sk), subflow) + tout = max(tout, mptcp_timeout_from_subflow(subflow)); + __mptcp_set_timeout(sk, tout); +} + static bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } +void mptcp_subflow_send_ack(struct sock *ssk) +{ + bool slow; + + slow = lock_sock_fast(ssk); + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - lock_sock(ssk); - if (tcp_can_send_ack(ssk)) - tcp_send_ack(ssk); - release_sock(ssk); - } + mptcp_for_each_subflow(msk, subflow) + mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } -static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) { - int ret; + bool slow; - lock_sock(ssk); - ret = tcp_can_send_ack(ssk); - if (ret) + slow = lock_sock_fast(ssk); + if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); - release_sock(ssk); - return ret; + unlock_sock_fast(ssk, slow); +} + +static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) +{ + const struct inet_connection_sock *icsk = inet_csk(ssk); + u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); + const struct tcp_sock *tp = tcp_sk(ssk); + + return (ack_pending & ICSK_ACK_SCHED) && + ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > + READ_ONCE(icsk->icsk_ack.rcv_mss)) || + (rx_empty && ack_pending & + (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { - struct sock *ack_hint = READ_ONCE(msk->ack_hint); int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; - bool cleanup; + int space = __mptcp_space(sk); + bool cleanup, rx_empty; - /* this is a simple superset of what tcp_cleanup_rbuf() implements - * so that we don't have to acquire the ssk socket lock most of the time - * to do actually nothing - */ - cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); - if (!cleanup) - return; + cleanup = (space > 0) && (space >= (old_space << 1)); + rx_empty = !__mptcp_rmem(sk); - /* if the hinted ssk is still active, try to use it */ - if (likely(ack_hint)) { - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) - return; - } + if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) + mptcp_subflow_cleanup_rbuf(ssk); } - - /* otherwise pick the first active subflow */ - mptcp_for_each_subflow(msk, subflow) - if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) - return; } static bool mptcp_check_data_fin(struct sock *sk) @@ -523,7 +547,6 @@ static bool mptcp_check_data_fin(struct sock *sk) } ret = true; - mptcp_set_timeout(sk, NULL); mptcp_send_ack(msk); mptcp_close_wake_up(sk); } @@ -618,7 +641,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, break; } } while (more_data_avail); - WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; return done; @@ -675,9 +697,6 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (inet_sk_state_load(sk) == TCP_CLOSE) - return false; - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { @@ -716,8 +735,10 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) sk_rbuf = ssk_rbuf; /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ - if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) + if (__mptcp_rmem(sk) > sk_rbuf) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); return; + } /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); @@ -785,10 +806,7 @@ static void mptcp_reset_timer(struct sock *sk) if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; - /* should never be called with mptcp level timer cleared */ - tout = READ_ONCE(mptcp_sk(sk)->timer_ival); - if (WARN_ON_ONCE(!tout)) - tout = TCP_RTO_MIN; + tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } @@ -893,22 +911,14 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static int mptcp_wmem_with_overhead(struct sock *sk, int size) +static int mptcp_wmem_with_overhead(int size) { - struct mptcp_sock *msk = mptcp_sk(sk); - int ret, skbs; - - ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); - skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; - if (skbs < msk->skb_tx_cache.qlen) - return ret; - - return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); + return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); } static void __mptcp_wmem_reserve(struct sock *sk, int size) { - int amount = mptcp_wmem_with_overhead(sk, size); + int amount = mptcp_wmem_with_overhead(size); struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); @@ -996,6 +1006,13 @@ static void mptcp_wmem_uncharge(struct sock *sk, int size) msk->wmem_reserved += size; } +static void __mptcp_mem_reclaim_partial(struct sock *sk) +{ + lockdep_assert_held_once(&sk->sk_lock.slock); + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); +} + static void mptcp_mem_reclaim_partial(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1048,8 +1065,14 @@ static void __mptcp_clean_una(struct sock *sk) if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; - if (WARN_ON_ONCE(dfrag == msk->first_pending)) - break; + if (unlikely(dfrag == msk->first_pending)) { + /* in recovery mode can see ack after the current snd head */ + if (WARN_ON_ONCE(!msk->recovery)) + break; + + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + dfrag_clear(sk, dfrag); cleaned = true; } @@ -1058,8 +1081,14 @@ static void __mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->already_sent)) - goto out; + /* prevent wrap around in recovery mode */ + if (unlikely(delta > dfrag->already_sent)) { + if (WARN_ON_ONCE(!msk->recovery)) + goto out; + if (WARN_ON_ONCE(delta > dfrag->data_len)) + goto out; + dfrag->already_sent += delta - dfrag->already_sent; + } dfrag->data_seq += delta; dfrag->offset += delta; @@ -1070,16 +1099,16 @@ static void __mptcp_clean_una(struct sock *sk) cleaned = true; } + /* all retransmitted data acked, recovery completed */ + if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) + msk->recovery = false; + out: - if (cleaned) { - if (tcp_under_memory_pressure(sk)) { - __mptcp_update_wmem(sk); - sk_mem_reclaim_partial(sk); - } - } + if (cleaned && tcp_under_memory_pressure(sk)) + __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt)) { - if (msk->timer_ival && !mptcp_data_fin_enabled(msk)) + if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) { + if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { mptcp_reset_timer(sk); @@ -1156,6 +1185,7 @@ struct mptcp_sendmsg_info { u16 limit; u16 sent; unsigned int flags; + bool data_lock_held; }; static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, @@ -1203,49 +1233,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) return NULL; } -static bool mptcp_tx_cache_refill(struct sock *sk, int size, - struct sk_buff_head *skbs, int *total_ts) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sk_buff *skb; - int space_needed; - - if (unlikely(tcp_under_memory_pressure(sk))) { - mptcp_mem_reclaim_partial(sk); - - /* under pressure pre-allocate at most a single skb */ - if (msk->skb_tx_cache.qlen) - return true; - space_needed = msk->size_goal_cache; - } else { - space_needed = msk->tx_pending_data + size - - msk->skb_tx_cache.qlen * msk->size_goal_cache; - } - - while (space_needed > 0) { - skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); - if (unlikely(!skb)) { - /* under memory pressure, try to pass the caller a - * single skb to allow forward progress - */ - while (skbs->qlen > 1) { - skb = __skb_dequeue_tail(skbs); - *total_ts -= skb->truesize; - __kfree_skb(skb); - } - return skbs->qlen > 0; - } - - *total_ts += skb->truesize; - __skb_queue_tail(skbs, skb); - space_needed -= msk->size_goal_cache; - } - return true; -} - static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { - struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb; if (ssk->sk_tx_skb_cache) { @@ -1256,22 +1245,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) return true; } - skb = skb_peek(&msk->skb_tx_cache); - if (skb) { - if (likely(sk_wmem_schedule(ssk, skb->truesize))) { - skb = __skb_dequeue(&msk->skb_tx_cache); - if (WARN_ON_ONCE(!skb)) - return false; - - mptcp_wmem_uncharge(sk, skb->truesize); - ssk->sk_tx_skb_cache = skb; - return true; - } - - /* over memory limit, no point to try to allocate a new skb */ - return false; - } - skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return false; @@ -1284,18 +1257,29 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) return false; } -static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { - return !ssk->sk_tx_skb_cache && - !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && - tcp_under_memory_pressure(sk); + gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; + + if (unlikely(tcp_under_memory_pressure(sk))) { + if (data_lock_held) + __mptcp_mem_reclaim_partial(sk); + else + mptcp_mem_reclaim_partial(sk); + } + return __mptcp_alloc_tx_skb(sk, ssk, gfp); } -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +/* note: this always recompute the csum on the whole skb, even + * if we just appended a single frag. More status info needed + */ +static void mptcp_update_data_checksum(struct sk_buff *skb, int added) { - if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) - mptcp_mem_reclaim_partial(sk); - return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); + struct mptcp_ext *mpext = mptcp_get_ext(skb); + __wsum csum = ~csum_unfold(mpext->csum); + int offset = skb->len - added; + + mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, @@ -1307,7 +1291,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; - bool can_collapse = false; + bool must_collapse = false; int size_bias = 0; int avail_size; size_t ret = 0; @@ -1318,7 +1302,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); avail_size = info->size_goal; - msk->size_goal_cache = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { /* Limit the write to the size available in the @@ -1328,16 +1311,24 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * SSN association set here */ mpext = skb_ext_find(skb, SKB_EXT_MPTCP); - can_collapse = (info->size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) { + if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; - } else { + goto alloc_skb; + } + + must_collapse = (info->size_goal - skb->len > 0) && + (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags); + if (must_collapse) { size_bias = skb->len; avail_size = info->size_goal - skb->len; } } +alloc_skb: + if (!must_collapse && !ssk->sk_tx_skb_cache && + !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held)) + return 0; + /* Zero window and all data acked? Probe. */ avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); if (avail_size == 0) { @@ -1367,7 +1358,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (skb == tail) { TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; - WARN_ON_ONCE(!can_collapse); WARN_ON_ONCE(zero_window_probe); goto out; } @@ -1392,10 +1382,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mpext->frozen = 1; - ret = 0; + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); tcp_push_pending_frames(ssk); + return 0; } out: + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } @@ -1411,16 +1405,44 @@ struct subflow_send_info { u64 ratio; }; +void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) +{ + if (!subflow->stale) + return; + + subflow->stale = 0; + MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); +} + +bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + if (unlikely(subflow->stale)) { + u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); + + if (subflow->stale_rcv_tstamp == rcv_tstamp) + return false; + + mptcp_subflow_set_active(subflow); + } + return __mptcp_subflow_active(subflow); +} + +/* implement the mptcp packet scheduler; + * returns the subflow that will transmit the next DSS + * additionally updates the rtx timeout + */ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; int i, nr_active = 0; struct sock *ssk; + long tout = 0; u64 ratio; u32 pace; - sock_owned_by_me((struct sock *)msk); + sock_owned_by_me(sk); if (__mptcp_check_fallback(msk)) { if (!msk->first) @@ -1431,8 +1453,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) /* re-use last subflow, if the burst allow that */ if (msk->last_snd && msk->snd_burst > 0 && sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { + mptcp_set_timeout(sk); return msk->last_snd; + } /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < 2; ++i) { @@ -1445,6 +1469,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!mptcp_subflow_active(subflow)) continue; + tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !subflow->backup; if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) continue; @@ -1460,6 +1485,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) send_info[subflow->backup].ratio = ratio; } } + __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) @@ -1478,12 +1504,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) static void mptcp_push_release(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { - mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } -static void __mptcp_push_pending(struct sock *sk, unsigned int flags) +void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); @@ -1504,25 +1529,20 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags) mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); - /* try to keep the subflow socket lock across - * consecutive xmit on the same socket + /* First check. If the ssk has changed since + * the last round, release prev_ssk */ if (ssk != prev_ssk && prev_ssk) mptcp_push_release(sk, prev_ssk, &info); if (!ssk) goto out; - if (ssk != prev_ssk || !prev_ssk) - lock_sock(ssk); - - /* keep it simple and always provide a new skb for the - * subflow, even if we will not use it when collapsing - * on the pending one + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock */ - if (!mptcp_alloc_tx_skb(sk, ssk)) { - mptcp_push_release(sk, ssk, &info); - goto out; - } + if (ssk != prev_ssk) + lock_sock(ssk); ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { @@ -1546,18 +1566,19 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags) mptcp_push_release(sk, ssk, &info); out: - if (copied) { - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + /* ensure the rtx timer is running */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (copied) __mptcp_check_send_data_fin(sk); - } } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info; + struct mptcp_sendmsg_info info = { + .data_lock_held = true, + }; struct mptcp_data_frag *dfrag; struct sock *xmit_ssk; int len, copied = 0; @@ -1583,13 +1604,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) goto out; } - if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { - __mptcp_update_wmem(sk); - sk_mem_reclaim_partial(sk); - } - if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) - goto out; - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) goto out; @@ -1612,7 +1626,6 @@ out: */ __mptcp_update_wmem(sk); if (copied) { - mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_timer_pending(sk)) @@ -1663,7 +1676,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; - struct sk_buff_head skbs; bool dfrag_collapsed; size_t psize, offset; @@ -1696,16 +1708,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); total_ts = psize + frag_truesize; - __skb_queue_head_init(&skbs); - if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) - goto wait_for_memory; - if (!mptcp_wmem_alloc(sk, total_ts)) { - __skb_queue_purge(&skbs); + if (!mptcp_wmem_alloc(sk, total_ts)) goto wait_for_memory; - } - skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { mptcp_wmem_uncharge(sk, psize + frag_truesize); @@ -1762,7 +1768,7 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, timeo, - test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + test_bit(MPTCP_DATA_READY, &msk->flags), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1770,7 +1776,9 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, - size_t len, int flags) + size_t len, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; @@ -1790,6 +1798,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } } + if (MPTCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= MPTCP_CMSG_TS; + } + copied += count; if (count < data_len) { @@ -1801,7 +1814,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, if (!(flags & MSG_PEEK)) { /* we will bulk release the skb memory later */ skb->destructor = NULL; - msk->rmem_released += skb->truesize; + WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); } @@ -1920,7 +1933,7 @@ static void __mptcp_update_rmem(struct sock *sk) atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, msk->rmem_released); - msk->rmem_released = 0; + WRITE_ONCE(msk->rmem_released, 0); } static void __mptcp_splice_receive_queue(struct sock *sk) @@ -1953,7 +1966,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - tcp_cleanup_rbuf(ssk, moved); if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); @@ -1969,7 +1981,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); - mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@ -1980,7 +1991,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - int copied = 0; + struct scm_timestamping_internal tss; + int copied = 0, cmsg_flags = 0; int target; long timeo; @@ -2002,7 +2014,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags); + bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2078,11 +2090,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, */ if (unlikely(__mptcp_move_skbs(msk))) set_bit(MPTCP_DATA_READY, &msk->flags); - } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { - /* data to read but mptcp_wait_data() cleared DATA_READY */ - set_bit(MPTCP_DATA_READY, &msk->flags); } + out_err: + if (cmsg_flags && copied >= 0) { + if (cmsg_flags & MPTCP_CMSG_TS) + tcp_recv_timestamp(msg, sk, &tss); + } + pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), skb_queue_empty_lockless(&sk->sk_receive_queue), copied); @@ -2126,10 +2141,11 @@ static void mptcp_timeout_timer(struct timer_list *t) * * A backup subflow is returned only if that is the only kind available. */ -static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) +static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { + struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; - struct sock *backup = NULL; + int min_stale_count = INT_MAX; sock_owned_by_me((const struct sock *)msk); @@ -2139,14 +2155,14 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (!mptcp_subflow_active(subflow)) + if (!__mptcp_subflow_active(subflow)) continue; - /* still data outstanding at TCP level? Don't retransmit. */ - if (!tcp_write_queue_empty(ssk)) { - if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss) - continue; - return NULL; + /* still data outstanding at TCP level? skip this */ + if (!tcp_rtx_and_write_queues_empty(ssk)) { + mptcp_pm_subflow_chk_stale(msk, ssk); + min_stale_count = min_t(int, min_stale_count, subflow->stale_count); + continue; } if (subflow->backup) { @@ -2155,10 +2171,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) continue; } - return ssk; + if (!pick) + pick = ssk; } - return backup; + if (pick) + return pick; + + /* use backup only if there are no progresses anywhere */ + return min_stale_count > 1 ? backup : NULL; } static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) @@ -2169,6 +2190,50 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) } } +bool __mptcp_retransmit_pending_data(struct sock *sk) +{ + struct mptcp_data_frag *cur, *rtx_head; + struct mptcp_sock *msk = mptcp_sk(sk); + + if (__mptcp_check_fallback(mptcp_sk(sk))) + return false; + + if (tcp_rtx_and_write_queues_empty(sk)) + return false; + + /* the closing socket has some data untransmitted and/or unacked: + * some data in the mptcp rtx queue has not really xmitted yet. + * keep it simple and re-inject the whole mptcp level rtx queue + */ + mptcp_data_lock(sk); + __mptcp_clean_una_wakeup(sk); + rtx_head = mptcp_rtx_head(sk); + if (!rtx_head) { + mptcp_data_unlock(sk); + return false; + } + + /* will accept ack for reijected data before re-sending them */ + if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt)) + msk->recovery_snd_nxt = msk->snd_nxt; + msk->recovery = true; + mptcp_data_unlock(sk); + + msk->first_pending = rtx_head; + msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq; + msk->snd_nxt = rtx_head->data_seq; + msk->snd_burst = 0; + + /* be sure to clear the "sent status" on all re-injected fragments */ + list_for_each_entry(cur, &msk->rtx_queue, list) { + if (!cur->already_sent) + break; + cur->already_sent = 0; + } + + return true; +} + /* subflow sockets can be either outgoing (connect) or incoming * (accept). * @@ -2181,6 +2246,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { struct mptcp_sock *msk = mptcp_sk(sk); + bool need_push; list_del(&subflow->node); @@ -2192,6 +2258,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk->sk_socket) sock_orphan(ssk); + need_push = __mptcp_retransmit_pending_data(sk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2214,14 +2281,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk == msk->last_snd) msk->last_snd = NULL; - if (ssk == msk->ack_hint) - msk->ack_hint = NULL; - if (ssk == msk->first) msk->first = NULL; if (msk->subflow && ssk == msk->subflow->sk) mptcp_dispose_initial_subflow(msk); + + if (need_push) + __mptcp_push_pending(sk, 0); } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, @@ -2288,13 +2355,14 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + bool slow; - lock_sock(tcp_sk); + slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { tcp_send_active_reset(tcp_sk, GFP_ATOMIC); tcp_set_state(tcp_sk, TCP_CLOSE); } - release_sock(tcp_sk); + unlock_sock_fast(tcp_sk, slow); } inet_sk_state_store(sk, TCP_CLOSE); @@ -2339,11 +2407,8 @@ static void __mptcp_retrans(struct sock *sk) /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; - info.limit = dfrag->already_sent; - while (info.sent < dfrag->already_sent) { - if (!mptcp_alloc_tx_skb(sk, ssk)) - break; - + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; + while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; @@ -2352,11 +2417,12 @@ static void __mptcp_retrans(struct sock *sk) copied += ret; info.sent += ret; } - if (copied) + if (copied) { + dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + } - mptcp_set_timeout(sk, ssk); release_sock(ssk); reset_timer: @@ -2422,17 +2488,17 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); - __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->wmem_reserved = 0; - msk->rmem_released = 0; + WRITE_ONCE(msk->rmem_released, 0); msk->tx_pending_data = 0; - msk->size_goal_cache = TCP_BASE_MSS; + msk->timer_ival = TCP_RTO_MIN; - msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + msk->recovery = false; mptcp_pm_data_init(msk); @@ -2484,15 +2550,10 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - struct sk_buff *skb; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); - while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { - sk->sk_forward_alloc += skb->truesize; - kfree_skb(skb); - } } static void mptcp_cancel_work(struct sock *sk) @@ -2522,7 +2583,6 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) tcp_shutdown(ssk, how); } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); - mptcp_set_timeout(sk, ssk); tcp_send_ack(ssk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -2773,6 +2833,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; WRITE_ONCE(msk->fully_established, false); + if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) + WRITE_ONCE(msk->csum_enabled, true); msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; @@ -2780,7 +2842,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; - if (mp_opt->mp_capable) { + if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { msk->can_ack = true; msk->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); @@ -2946,6 +3008,11 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } + /* be sure to set the current sk state before tacking actions + * depending on sk_state + */ + if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) + __mptcp_set_connected(sk); if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) __mptcp_clean_una_wakeup(sk); if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 385796f0ef19..d3e6fd1615f1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -26,6 +26,15 @@ #define OPTION_MPTCP_FASTCLOSE BIT(8) #define OPTION_MPTCP_PRIO BIT(9) #define OPTION_MPTCP_RST BIT(10) +#define OPTION_MPTCP_DSS BIT(11) +#define OPTION_MPTCP_FAIL BIT(12) + +#define OPTION_MPTCP_CSUMREQD BIT(13) + +#define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ + OPTION_MPTCP_MPC_ACK) +#define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ + OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -67,6 +76,9 @@ #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 +#define TCPOLEN_MPTCP_FAIL 12 + +#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -77,8 +89,9 @@ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) -#define MPTCP_CAP_FLAG_MASK (0x3F) +#define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) @@ -109,6 +122,7 @@ #define MPTCP_ERROR_REPORT 8 #define MPTCP_RETRANSMIT 9 #define MPTCP_WORK_SYNC_SETSOCKOPT 10 +#define MPTCP_CONNECTED 11 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -124,33 +138,29 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; - u16 mp_capable : 1, - mp_join : 1, - fastclose : 1, - reset : 1, - dss : 1, - add_addr : 1, - rm_addr : 1, - mp_prio : 1, - echo : 1, - backup : 1; + __sum16 csum; + u16 suboptions; u32 token; u32 nonce; - u64 thmac; - u8 hmac[MPTCPOPT_HMAC_LEN]; - u8 join_id; - u8 use_map:1, + u16 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, + reset_reason:4, + reset_transient:1, + echo:1, + backup:1, + deny_join_id0:1, __unused:2; + u8 join_id; + u64 thmac; + u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u64 ahmac; - u8 reset_reason:4; - u8 reset_transient:1; + u64 fail_seq; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) @@ -171,8 +181,6 @@ enum mptcp_pm_status { enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, - MPTCP_ADD_ADDR_IPV6, - MPTCP_ADD_ADDR_PORT, MPTCP_RM_ADDR_SIGNAL, }; @@ -188,6 +196,7 @@ struct mptcp_pm_data { bool work_pending; bool accept_addr; bool accept_subflow; + bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; @@ -222,27 +231,30 @@ struct mptcp_sock { struct sock *last_snd; int snd_burst; int old_wspace; + u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; + * recovery related fields are under data_lock + * protection + */ u64 snd_una; u64 wnd_end; unsigned long timer_ival; u32 token; int rmem_released; unsigned long flags; + bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ + bool csum_enabled; spinlock_t join_list_lock; - struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ int tx_pending_data; - int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -290,9 +302,17 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +/* the msk socket don't use the backlog, also account for the bulk + * free memory + */ +static inline int __mptcp_rmem(const struct sock *sk) +{ + return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); +} + static inline int __mptcp_space(const struct sock *sk) { - return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released); + return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@ -335,11 +355,20 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } +struct csum_pseudo_header { + __be64 data_seq; + __be32 subflow_seq; + __be16 data_len; + __sum16 csum; +}; + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, - backup : 1; + backup : 1, + csum_reqd : 1, + allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; @@ -386,6 +415,8 @@ struct mptcp_subflow_context { u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; + __wsum map_data_csum; + u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, @@ -395,12 +426,16 @@ struct mptcp_subflow_context { pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, + map_csum_reqd : 1, + map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, + send_mp_fail : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ - disposable : 1; /* ctx can be free at ulp release time */ + disposable : 1, /* ctx can be free at ulp release time */ + stale : 1; /* unable to snd/rcv data, do not use for xmit */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -412,11 +447,13 @@ struct mptcp_subflow_context { u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; + u8 stale_count; long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ - u32 setsockopt_seq; + u32 setsockopt_seq; + u32 stale_rcv_tstamp; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ @@ -522,29 +559,34 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); } -int mptcp_is_enabled(struct net *net); -unsigned int mptcp_get_add_addr_timeout(struct net *net); +int mptcp_is_enabled(const struct net *net); +unsigned int mptcp_get_add_addr_timeout(const struct net *net); +int mptcp_is_checksum_enabled(const struct net *net); +int mptcp_allow_join_id0(const struct net *net); +unsigned int mptcp_stale_loss_cnt(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); +bool __mptcp_retransmit_pending_data(struct sock *sk); +void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); +void mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote, - u8 flags, int ifindex); + const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); -static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -556,6 +598,10 @@ static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } +void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); + +bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); + static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { @@ -567,6 +613,19 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } +static inline bool mptcp_has_another_subflow(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk), *tmp; + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + mptcp_for_each_subflow(msk, tmp) { + if (tmp != subflow) + return true; + } + + return false; +} + void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); @@ -575,10 +634,12 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +void __mptcp_set_connected(struct sock *sk); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && @@ -593,6 +654,14 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); +static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) +{ + if (use_64bit) + return cur_seq; + + return __mptcp_expand_seq(old_seq, cur_seq); +} void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); @@ -626,6 +695,8 @@ static inline void mptcp_write_space(struct sock *sk) void mptcp_destroy_common(struct mptcp_sock *msk); +#define MPTCP_TOKEN_MAX_RETRIES 4 + void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { @@ -649,6 +720,8 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); +void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -667,6 +740,7 @@ void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, u8 bkup); +void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * @@ -675,6 +749,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, struct mptcp_addr_info *addr); +int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, + u8 *flags, int *ifindex); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, @@ -689,22 +765,18 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); + return READ_ONCE(msk->pm.addr_signal) & + (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)); } -static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) -{ - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); -} - -static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) +static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } -static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) +static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) @@ -735,8 +807,10 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo, bool *port); +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, + unsigned int opt_size, unsigned int remaining, + struct mptcp_addr_info *addr, bool *echo, + bool *port, bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); @@ -752,9 +826,6 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); - void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_all(struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a79798189599..8c03afac5ca0 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -140,8 +140,34 @@ static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); } +static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return ret; + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + sock_set_timestamp(sk, optname, !!val); + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) + sockptr_t optval, + unsigned int optlen) { int val, ret; @@ -164,11 +190,60 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, case SO_INCOMING_CPU: mptcp_so_incoming_cpu(msk, val); return 0; + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val); } return -ENOPROTOOPT; } +static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, + int optname, + sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct so_timestamping timestamping; + int ret; + + if (optlen == sizeof(timestamping)) { + if (copy_from_sockptr(×tamping, optval, + sizeof(timestamping))) + return -EFAULT; + } else if (optlen == sizeof(int)) { + memset(×tamping, 0, sizeof(timestamping)); + + if (copy_from_sockptr(×tamping.flags, optval, sizeof(int))) + return -EFAULT; + } else { + return -EINVAL; + } + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + KERNEL_SOCKPTR(×tamping), + sizeof(timestamping)); + if (ret) + return ret; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + sock_set_timestamping(sk, optname, timestamping); + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + + return 0; +} + static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, unsigned int optlen) { @@ -251,9 +326,26 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_MARK: case SO_INCOMING_CPU: case SO_DEBUG: - return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + return mptcp_setsockopt_sol_socket_int(msk, optname, optval, + optlen); + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + return mptcp_setsockopt_sol_socket_timestamping(msk, optname, + optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + /* No need to copy: only relevant for msk */ + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); case SO_NO_CHECK: case SO_DONTROUTE: case SO_BROADCAST: @@ -267,7 +359,24 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return 0; } - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); + /* SO_OOBINLINE is not supported, let's avoid the related mess + * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + * + * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + * + * SO_PEEK_OFF is unsupported, as it is for plain TCP + * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows + * SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + * + * SO_ZEROCOPY is currently unsupported, TODO in sndmsg + * SO_TXTIME is currently unsupported + */ + + return -EOPNOTSUPP; } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, @@ -299,72 +408,6 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, static bool mptcp_supported_sockopt(int level, int optname) { - if (level == SOL_SOCKET) { - switch (optname) { - case SO_DEBUG: - case SO_REUSEPORT: - case SO_REUSEADDR: - - /* the following ones need a better implementation, - * but are quite common we want to preserve them - */ - case SO_BINDTODEVICE: - case SO_SNDBUF: - case SO_SNDBUFFORCE: - case SO_RCVBUF: - case SO_RCVBUFFORCE: - case SO_KEEPALIVE: - case SO_PRIORITY: - case SO_LINGER: - case SO_TIMESTAMP_OLD: - case SO_TIMESTAMP_NEW: - case SO_TIMESTAMPNS_OLD: - case SO_TIMESTAMPNS_NEW: - case SO_TIMESTAMPING_OLD: - case SO_TIMESTAMPING_NEW: - case SO_RCVLOWAT: - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - case SO_MARK: - case SO_INCOMING_CPU: - case SO_BINDTOIFINDEX: - case SO_BUSY_POLL: - case SO_PREFER_BUSY_POLL: - case SO_BUSY_POLL_BUDGET: - - /* next ones are no-op for plain TCP */ - case SO_NO_CHECK: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_BSDCOMPAT: - case SO_PASSCRED: - case SO_PASSSEC: - case SO_RXQ_OVFL: - case SO_WIFI_STATUS: - case SO_NOFCS: - case SO_SELECT_ERR_QUEUE: - return true; - } - - /* SO_OOBINLINE is not supported, let's avoid the related mess */ - /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, - * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, - * we must be careful with subflows - */ - /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks - * explicitly the sk_protocol field - */ - /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ - /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ - /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, - * but likely needs careful design - */ - /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ - /* SO_TXTIME is currently unsupported */ - return false; - } if (level == SOL_IP) { switch (optname) { /* should work fine */ @@ -574,12 +617,12 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, pr_debug("msk=%p", msk); - if (!mptcp_supported_sockopt(level, optname)) - return -ENOPROTOOPT; - if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the * MPTCP-level socket to configure the subflows until the subflow diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be1de4084196..1de7ce883c37 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -108,6 +108,8 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); + subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } @@ -139,6 +141,7 @@ static int subflow_check_req(struct request_sock *req, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; + bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); @@ -150,19 +153,21 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); - if (mp_opt.mp_capable) { + opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); - if (mp_opt.mp_join) + if (opt_mp_join) return 0; - } else if (mp_opt.mp_join) { + } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } - if (mp_opt.mp_capable && listener->request_mptcp) { - int err, retries = 4; + if (opt_mp_capable && listener->request_mptcp) { + int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: @@ -192,7 +197,7 @@ again: else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); - } else if (mp_opt.mp_join && listener->request_mptcp) { + } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; @@ -212,11 +217,6 @@ again: ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { - sock_put((struct sock *)subflow_req->msk); - mptcp_token_destroy_request(req); - tcp_request_sock_ops.destructor(req); - subflow_req->msk = NULL; - subflow_req->mp_join = 0; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); return -EPERM; } @@ -228,6 +228,8 @@ again: if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); + else + return -EPERM; } pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, @@ -244,15 +246,18 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; + bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); - if (mp_opt.mp_capable && mp_opt.mp_join) + opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + if (opt_mp_capable && opt_mp_join) return -EINVAL; - if (mp_opt.mp_capable && listener->request_mptcp) { + if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; @@ -263,13 +268,11 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; - } else if (mp_opt.mp_join && listener->request_mptcp) { + } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; - if (mptcp_can_accept_new_subflow(subflow_req->msk)) - subflow_req->mp_join = 1; - + subflow_req->mp_join = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } @@ -371,6 +374,24 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } +void __mptcp_set_connected(struct sock *sk) +{ + if (sk->sk_state == TCP_SYN_SENT) { + inet_sk_state_store(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + } +} + +static void mptcp_set_connected(struct sock *sk) +{ + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_set_connected(sk); + else + set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -379,11 +400,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } - /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; @@ -394,9 +410,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { - if (!mp_opt.mp_capable) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); @@ -404,6 +420,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto fallback; } + if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) + WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); + if (mp_opt.deny_join_id0) + WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; @@ -411,18 +431,21 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); + mptcp_set_connected(parent); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } + subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, - subflow->thmac, subflow->remote_nonce); + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", + subflow, subflow->thmac, subflow->remote_nonce, + subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); @@ -430,15 +453,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto do_reset; } + if (!mptcp_finish_join(sk)) + goto do_reset; + subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (!mptcp_finish_join(sk)) - goto do_reset; - subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); @@ -451,6 +474,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); + mptcp_set_connected(parent); } return; @@ -558,6 +582,7 @@ static void mptcp_sock_destruct(struct sock *sk) static void mptcp_force_close(struct sock *sk) { + /* the msk is not yet exposed to user-space */ inet_sk_state_store(sk, TCP_CLOSE); sk_common_release(sk); } @@ -616,10 +641,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - /* After child creation we must look for 'mp_capable' even when options + /* After child creation we must look for MPC even when options * are not parsed */ - mp_opt.mp_capable = 0; + mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); @@ -638,8 +663,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. */ - mptcp_get_options(skb, &mp_opt); - if (!mp_opt.mp_capable) { + mptcp_get_options(sk, skb, &mp_opt); + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { fallback = true; goto create_child; } @@ -648,8 +673,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(skb, &mp_opt); - if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || + mptcp_get_options(sk, skb, &mp_opt); + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || + !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; @@ -706,7 +732,7 @@ create_child: /* with OoO packets we can reach here without ingress * mpc option */ - if (mp_opt.mp_capable) + if (mp_opt.suboptions & OPTIONS_MPTCP_MPC) mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; @@ -775,15 +801,6 @@ enum mapping_status { MAPPING_DUMMY }; -static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) -{ - if ((u32)seq == (u32)old_seq) - return old_seq; - - /* Assume map covers data not mapped yet. */ - return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); -} - static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", @@ -824,10 +841,94 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) return true; } +static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, + bool csum_reqd) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct csum_pseudo_header header; + u32 offset, seq, delta; + __wsum csum; + int len; + + if (!csum_reqd) + return MAPPING_OK; + + /* mapping already validated on previous traversal */ + if (subflow->map_csum_len == subflow->map_data_len) + return MAPPING_OK; + + /* traverse the receive queue, ensuring it contains a full + * DSS mapping and accumulating the related csum. + * Preserve the accoumlate csum across multiple calls, to compute + * the csum only once + */ + delta = subflow->map_data_len - subflow->map_csum_len; + for (;;) { + seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; + offset = seq - TCP_SKB_CB(skb)->seq; + + /* if the current skb has not been accounted yet, csum its contents + * up to the amount covered by the current DSS + */ + if (offset < skb->len) { + __wsum csum; + + len = min(skb->len - offset, delta); + csum = skb_checksum(skb, offset, len, 0); + subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, + subflow->map_csum_len); + + delta -= len; + subflow->map_csum_len += len; + } + if (delta == 0) + break; + + if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { + /* if this subflow is closed, the partial mapping + * will be never completed; flush the pending skbs, so + * that subflow_sched_work_if_closed() can kick in + */ + if (unlikely(ssk->sk_state == TCP_CLOSE)) + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + + /* not enough data to validate the csum */ + return MAPPING_EMPTY; + } + + /* the DSS mapping for next skbs will be validated later, + * when a get_mapping_status call will process such skb + */ + skb = skb->next; + } + + /* note that 'map_data_len' accounts only for the carried data, does + * not include the eventual seq increment due to the data fin, + * while the pseudo header requires the original DSS data len, + * including that + */ + header.data_seq = cpu_to_be64(subflow->map_seq); + header.subflow_seq = htonl(subflow->map_subflow_seq); + header.data_len = htons(subflow->map_data_len + subflow->map_data_fin); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); + if (unlikely(csum_fold(csum))) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); + subflow->send_mp_fail = 1; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); + return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + } + + return MAPPING_OK; +} + static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -907,22 +1008,17 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len--; } - if (!mpext->dsn64) { - map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, - mpext->data_seq); - pr_debug("expanded seq=%llu", subflow->map_seq); - } else { - map_seq = mpext->data_seq; - } + map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && - subflow->map_data_len == data_len) { + subflow->map_data_len == data_len && + subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + goto validate_csum; } /* If this skb data are fully covered by the current mapping, @@ -934,27 +1030,41 @@ static enum mapping_status get_mapping_status(struct sock *ssk, } /* will validate the next map after consuming the current one */ - return MAPPING_OK; + goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; + subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_csum_reqd = mpext->csum_reqd; + subflow->map_csum_len = 0; + subflow->map_data_csum = csum_unfold(mpext->csum); + + /* Cfr RFC 8684 Section 3.3.0 */ + if (unlikely(subflow->map_csum_reqd != csum_reqd)) + return MAPPING_INVALID; + + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, - subflow->map_data_len); + subflow->map_data_len, subflow->map_csum_reqd, + subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ - if (!validate_mapping(ssk, skb)) + if (!validate_mapping(ssk, skb)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; + } skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + +validate_csum: + return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, @@ -1055,6 +1165,20 @@ no_data: fallback: /* RFC 8684 section 3.7. */ + if (subflow->send_mp_fail) { + if (mptcp_has_another_subflow(ssk)) { + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + } + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; + tcp_send_active_reset(ssk, GFP_ATOMIC); + WRITE_ONCE(subflow->data_avail, 0); + return true; + } + if (subflow->mp_join || subflow->fully_established) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers @@ -1137,7 +1261,7 @@ void __mptcp_error_report(struct sock *sk) /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); - sk->sk_error_report(sk); + sk_error_report(sk); break; } } @@ -1253,8 +1377,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote, - u8 flags, int ifindex) + const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; @@ -1265,6 +1388,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, struct sock *ssk; u32 remote_token; int addrlen; + int ifindex; + u8 flags; int err; if (!mptcp_is_fully_established(sk)) @@ -1288,6 +1413,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, local_id = err; } + mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, + &flags, &ifindex); subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -1489,10 +1616,7 @@ static void subflow_state_change(struct sock *sk) mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); subflow->conn_finished = 1; - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } + mptcp_set_connected(parent); } /* as recvmsg() does not acquire the subflow socket for ssk selection diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c index abe0fd099746..37127781aee9 100644 --- a/net/mptcp/syncookies.c +++ b/net/mptcp/syncookies.c @@ -37,7 +37,21 @@ static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net) { - u32 i = skb_get_hash(skb) ^ net_hash_mix(net); + static u32 mptcp_join_hash_secret __read_mostly; + struct tcphdr *th = tcp_hdr(skb); + u32 seq, i; + + net_get_random_once(&mptcp_join_hash_secret, + sizeof(mptcp_join_hash_secret)); + + if (th->syn) + seq = TCP_SKB_CB(skb)->seq; + else + seq = TCP_SKB_CB(skb)->seq - 1; + + i = jhash_3words(seq, net_hash_mix(net), + (__force __u32)th->source << 16 | (__force __u32)th->dest, + mptcp_join_hash_secret); return i % ARRAY_SIZE(join_entries); } diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 8f0270a780ce..a98e554b034f 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -33,7 +33,6 @@ #include <net/mptcp.h> #include "protocol.h" -#define TOKEN_MAX_RETRIES 4 #define TOKEN_MAX_CHAIN_LEN 4 struct token_bucket { @@ -153,12 +152,9 @@ int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - int retries = TOKEN_MAX_RETRIES; + int retries = MPTCP_TOKEN_MAX_RETRIES; struct token_bucket *bucket; - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); - again: mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, &subflow->idsn); @@ -172,6 +168,9 @@ again: goto again; } + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig index 93309081f5a4..ea1dd32b6b1f 100644 --- a/net/ncsi/Kconfig +++ b/net/ncsi/Kconfig @@ -17,3 +17,9 @@ config NCSI_OEM_CMD_GET_MAC help This allows to get MAC address from NCSI firmware and set them back to controller. +config NCSI_OEM_CMD_KEEP_PHY + bool "Keep PHY Link up" + depends on NET_NCSI + help + This allows to keep PHY link up and prevents any channel resets during + the host load. diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 49031f804276..03757e76bb6b 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -78,6 +78,10 @@ enum { /* OEM Vendor Manufacture ID */ #define NCSI_OEM_MFR_MLX_ID 0x8119 #define NCSI_OEM_MFR_BCM_ID 0x113d +#define NCSI_OEM_MFR_INTEL_ID 0x157 +/* Intel specific OEM command */ +#define NCSI_OEM_INTEL_CMD_GMA 0x06 /* CMD ID for Get MAC */ +#define NCSI_OEM_INTEL_CMD_KEEP_PHY 0x20 /* CMD ID for Keep PHY up */ /* Broadcom specific OEM Command */ #define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ /* Mellanox specific OEM Command */ @@ -86,6 +90,8 @@ enum { #define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */ #define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */ /* OEM Command payload lengths*/ +#define NCSI_OEM_INTEL_CMD_GMA_LEN 5 +#define NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN 7 #define NCSI_OEM_BCM_CMD_GMA_LEN 12 #define NCSI_OEM_MLX_CMD_GMA_LEN 8 #define NCSI_OEM_MLX_CMD_SMAF_LEN 60 @@ -95,6 +101,7 @@ enum { /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 #define MLX_MAC_ADDR_OFFSET 8 +#define INTEL_MAC_ADDR_OFFSET 1 struct ncsi_channel_version { @@ -238,7 +245,7 @@ struct ncsi_package { struct ncsi_dev_priv *ndp; /* NCSI device */ spinlock_t lock; /* Protect the package */ unsigned int channel_num; /* Number of channels */ - struct list_head channels; /* List of chanels */ + struct list_head channels; /* List of channels */ struct list_head node; /* Form list of packages */ bool multi_channel; /* Enable multiple channels */ @@ -271,6 +278,7 @@ enum { ncsi_dev_state_probe_mlx_gma, ncsi_dev_state_probe_mlx_smaf, ncsi_dev_state_probe_cis, + ncsi_dev_state_probe_keep_phy, ncsi_dev_state_probe_gvi, ncsi_dev_state_probe_gc, ncsi_dev_state_probe_gls, @@ -339,7 +347,7 @@ struct ncsi_cmd_arg { unsigned char type; /* Command in the NCSI packet */ unsigned char id; /* Request ID (sequence number) */ unsigned char package; /* Destination package ID */ - unsigned char channel; /* Detination channel ID or 0x1f */ + unsigned char channel; /* Destination channel ID or 0x1f */ unsigned short payload; /* Command packet payload length */ unsigned int req_flags; /* NCSI request properties */ union { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index ffff8da707b8..7121ce2a47c0 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -627,7 +627,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return 0; } -/* Find an outstanding VLAN tag and constuct a "Set VLAN Filter - Enable" +/* Find an outstanding VLAN tag and construct a "Set VLAN Filter - Enable" * packet. */ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, @@ -689,6 +689,35 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return 0; } +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) + +static int ncsi_oem_keep_phy_intel(struct ncsi_cmd_arg *nca) +{ + unsigned char data[NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN]; + int ret = 0; + + nca->payload = NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN; + + memset(data, 0, NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN); + *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID); + + data[4] = NCSI_OEM_INTEL_CMD_KEEP_PHY; + + /* PHY Link up attribute */ + data[6] = 0x1; + + nca->data = data; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + return ret; +} + +#endif + #if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) /* NCSI OEM Command APIs */ @@ -700,7 +729,7 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN; memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN); - *(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID); + *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_BCM_ID); data[5] = NCSI_OEM_BCM_CMD_GMA; nca->data = data; @@ -724,7 +753,7 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN; memset(&u, 0, sizeof(u)); - u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID); u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA; u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM; @@ -747,7 +776,7 @@ static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) int ret = 0; memset(&u, 0, sizeof(u)); - u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID); + u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID); u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF; u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM; memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET], @@ -766,13 +795,36 @@ static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_gma_handler_intel(struct ncsi_cmd_arg *nca) +{ + unsigned char data[NCSI_OEM_INTEL_CMD_GMA_LEN]; + int ret = 0; + + nca->payload = NCSI_OEM_INTEL_CMD_GMA_LEN; + + memset(data, 0, NCSI_OEM_INTEL_CMD_GMA_LEN); + *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID); + data[4] = NCSI_OEM_INTEL_CMD_GMA; + + nca->data = data; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, - { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx } + { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx }, + { NCSI_OEM_MFR_INTEL_ID, ncsi_oem_gma_handler_intel } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) @@ -1392,7 +1444,23 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) } nd->state = ncsi_dev_state_probe_gvi; + if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)) + nd->state = ncsi_dev_state_probe_keep_phy; + break; +#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) + case ncsi_dev_state_probe_keep_phy: + ndp->pending_req_num = 1; + + nca.type = NCSI_PKT_CMD_OEM; + nca.package = ndp->active_package->id; + nca.channel = 0; + ret = ncsi_oem_keep_phy_intel(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_gvi; break; +#endif /* CONFIG_NCSI_OEM_CMD_KEEP_PHY */ case ncsi_dev_state_probe_gvi: case ncsi_dev_state_probe_gc: case ncsi_dev_state_probe_gls: diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 80938b338fee..ba66c7dc3a21 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -178,6 +178,12 @@ struct ncsi_rsp_oem_bcm_pkt { unsigned char data[]; /* Cmd specific Data */ }; +/* Intel Response Data */ +struct ncsi_rsp_oem_intel_pkt { + unsigned char cmd; /* OEM Command ID */ + unsigned char data[]; /* Cmd specific Data */ +}; + /* Get Link Status */ struct ncsi_rsp_gls_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 888ccc2d4e34..6447a09932f5 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -403,7 +403,7 @@ static int ncsi_rsp_handler_ev(struct ncsi_request *nr) /* Update to VLAN mode */ cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd); ncm->enable = 1; - ncm->data[0] = ntohl(cmd->mode); + ncm->data[0] = ntohl((__force __be32)cmd->mode); return 0; } @@ -699,12 +699,61 @@ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr) return 0; } +/* Response handler for Intel command Get Mac Address */ +static int ncsi_rsp_handler_oem_intel_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[INTEL_MAC_ADDR_OFFSET], ETH_ALEN); + /* Increase mac address by 1 for BMC's address */ + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, + "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + +/* Response handler for Intel card */ +static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr) +{ + struct ncsi_rsp_oem_intel_pkt *intel; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + intel = (struct ncsi_rsp_oem_intel_pkt *)(rsp->data); + + if (intel->cmd == NCSI_OEM_INTEL_CMD_GMA) + return ncsi_rsp_handler_oem_intel_gma(nr); + + return 0; +} + static struct ncsi_rsp_oem_handler { unsigned int mfr_id; int (*handler)(struct ncsi_request *nr); } ncsi_rsp_oem_handlers[] = { { NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx }, - { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm } + { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }, + { NCSI_OEM_MFR_INTEL_ID, ncsi_rsp_handler_oem_intel } }; /* Response handler for OEM command */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 56a2531a3402..54395266339d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -19,6 +19,16 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_NETLINK_HOOK + tristate "Netfilter base hook dump support" + depends on NETFILTER_ADVANCED + depends on NF_TABLES + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + to list the base netfilter hooks via NFNETLINK. + This is helpful for debugging. + config NETFILTER_NETLINK_ACCT tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED @@ -816,7 +826,7 @@ config NETFILTER_XT_TARGET_CLASSIFY the priority of a packet. Some qdiscs can use this value for classification, among these are: - atm, cbq, dsmark, pfifo_fast, htb, prio + atm, cbq, dsmark, pfifo_fast, htb, prio To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e80e010354b1..aab20e575ecd 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o +obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o @@ -73,7 +74,7 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o @@ -211,3 +212,6 @@ obj-$(CONFIG_IP_SET) += ipset/ # IPVS obj-$(CONFIG_IP_VS) += ipvs/ + +# lwtunnel +obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index de2d20c37cda..16ae92054baa 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1685,8 +1685,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static int -call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, - struct nlattr *tb[], enum ipset_adt adt, +call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, + struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; @@ -1738,8 +1738,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1783,7 +1782,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, flags, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1794,7 +1793,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1859,7 +1858,6 @@ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -1885,12 +1883,7 @@ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1945,12 +1938,7 @@ static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1971,7 +1959,6 @@ static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, { struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; @@ -1990,12 +1977,7 @@ static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2014,7 +1996,6 @@ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -2038,12 +2019,7 @@ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2065,7 +2041,6 @@ static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) @@ -2091,12 +2066,7 @@ static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index d1bef23fd4f5..dd30c03d5a23 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -132,8 +132,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -144,6 +147,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + /* 64bit division is not allowed on 32bit */ + if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) { ip = ntohl(h->next.ip); e.ip = htonl(ip); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 18346d18aa16..153de3457423 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -121,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; + if (e.mark == 0 && e.ip == 0) + return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { @@ -133,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (e.mark == 0 && ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -143,6 +148,9 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } + if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index e1ca11196515..7303138e46be 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -173,6 +173,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index ab179e064597..334fb1ad0e86 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -180,6 +180,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 8f075b44cf64..7df94f437f60 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -253,6 +253,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c1a11f041ac6..1422739d9aa2 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -140,7 +140,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, ipn, n = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,6 +188,15 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); + n++; + } while (ipn++ < ip_to); + + if (n > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); do { diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index ddd51c2e1cb3..9810f5bf63f5 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, ipn, n = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,6 +256,14 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); + n++; + } while (ipn++ < ip_to); + + if (n > IPSET_MAX_RANGE) + return -ERANGE; if (retried) ip = ntohl(h->next.ip); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 6532f0505e66..3d09eefe998a 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -168,7 +168,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; + u64 n = 0, m = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -244,6 +245,19 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); + n++; + } while (ipn++ < ip_to); + ipn = ip2_from; + do { + ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); + m++; + } while (ipn++ < ip2_to); + + if (n*m > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index ec1564a1cb5a..09cf72eb37f8 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -158,7 +158,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; + u64 n = 0; bool with_ports = false; u8 cidr; int ret; @@ -235,6 +236,14 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); + n++; + } while (ipn++ < ip_to); + + if (n*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0e91d1e82f1c..19bcdb3141f6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -182,7 +182,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, ipn; + u64 n = 0, m = 0; bool with_ports = false; int ret; @@ -284,6 +285,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); + n++; + } while (ipn++ < ip_to); + ipn = ip2_from; + do { + ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); + m++; + } while (ipn++ < ip2_to); + + if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index d61886874940..271da8447b29 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -318,7 +318,7 @@ config IP_VS_MH_TAB_INDEX comment 'IPVS application helper' config IP_VS_FTP - tristate "FTP protocol helper" + tristate "FTP protocol helper" depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ NF_CONNTRACK_FTP select IP_VS_NFCT diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e0befcf8113a..94e18fb9690d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -21,7 +21,6 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> @@ -55,8 +54,6 @@ #include "nf_internals.h" -extern unsigned int nf_conntrack_net_id; - __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); @@ -68,26 +65,21 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; - u32 last_bucket; + u32 next_bucket; bool exiting; bool early_drop; - long next_gc_run; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; -/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ -#define GC_MAX_BUCKETS_DIV 128u -/* upper bound of full table scan */ -#define GC_MAX_SCAN_JIFFIES (16u * HZ) -/* desired ratio of entries found to be expired */ -#define GC_EVICT_RATIO 50u +#define GC_SCAN_INTERVAL (120u * HZ) +#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) -static struct conntrack_gc_work conntrack_gc_work; +#define MAX_CHAINLEN 64u -extern unsigned int nf_conntrack_net_id; +static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { @@ -153,7 +145,15 @@ static void nf_conntrack_all_lock(void) spin_lock(&nf_conntrack_locks_all_lock); - nf_conntrack_locks_all = true; + /* For nf_contrack_locks_all, only the latest time when another + * CPU will see an update is controlled, by the "release" of the + * spin_lock below. + * The earliest time is not controlled, an thus KCSAN could detect + * a race when nf_conntract_lock() reads the variable. + * WRITE_ONCE() is used to ensure the compiler will not + * optimize the write. + */ + WRITE_ONCE(nf_conntrack_locks_all, true); for (i = 0; i < CONNTRACK_LOCKS; i++) { spin_lock(&nf_conntrack_locks[i]); @@ -185,25 +185,31 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; -static unsigned int nf_conntrack_hash_rnd __read_mostly; +static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, const struct net *net) { - unsigned int n; - u32 seed; + struct { + struct nf_conntrack_man src; + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u16 proto; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, so we hash everything up to the - * destination ports (which is a multiple of 4) and treat the last - * three bytes manually. - */ - seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); - n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, seed ^ - (((__force __u16)tuple->dst.u.all << 16) | - tuple->dst.protonum)); + memset(&combined, 0, sizeof(combined)); + + /* The direction must be ignored, so handle usable members manually. */ + combined.src = tuple->src; + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(net); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.proto = tuple->dst.protonum; + + return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); } static u32 scale_hash(u32 hash) @@ -666,8 +672,13 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) return false; tstamp = nf_conn_tstamp_find(ct); - if (tstamp && tstamp->stop == 0) + if (tstamp) { + s32 timeout = ct->timeout - nfct_time_stamp; + tstamp->stop = ktime_get_real_ns(); + if (timeout < 0) + tstamp->stop -= jiffies_to_nsecs(-timeout); + } if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { @@ -831,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int chainlen = 0; unsigned int sequence; + int err = -EEXIST; zone = nf_ct_zone(ct); @@ -845,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } + + chainlen = 0; + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } smp_wmb(); /* The caller holds a reference to this object */ @@ -863,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; - +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); - return -EEXIST; + return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); @@ -1080,6 +1104,7 @@ int __nf_conntrack_confirm(struct sk_buff *skb) { const struct nf_conntrack_zone *zone; + unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1087,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - unsigned int sequence; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); @@ -1147,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + chainlen = 0; + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) { +chaintoolong: + nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, chaintoolong); + NF_CT_STAT_INC(net, insert_failed); + ret = NF_DROP; + goto dying; + } + } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in @@ -1354,17 +1391,13 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) static void gc_worker(struct work_struct *work) { - unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); - unsigned int i, goal, buckets = 0, expired_count = 0; - unsigned int nf_conntrack_max95 = 0; + unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION; + unsigned int i, hashsz, nf_conntrack_max95 = 0; + unsigned long next_run = GC_SCAN_INTERVAL; struct conntrack_gc_work *gc_work; - unsigned int ratio, scanned = 0; - unsigned long next_run; - gc_work = container_of(work, struct conntrack_gc_work, dwork.work); - goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; - i = gc_work->last_bucket; + i = gc_work->next_bucket; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; @@ -1372,15 +1405,15 @@ static void gc_worker(struct work_struct *work) struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int hashsz; struct nf_conn *tmp; - i++; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hashsz); - if (i >= hashsz) - i = 0; + if (i >= hashsz) { + rcu_read_unlock(); + break; + } hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { struct nf_conntrack_net *cnet; @@ -1388,7 +1421,6 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); - scanned++; if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); continue; @@ -1396,7 +1428,6 @@ static void gc_worker(struct work_struct *work) if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); - expired_count++; continue; } @@ -1404,7 +1435,7 @@ static void gc_worker(struct work_struct *work) continue; net = nf_ct_net(tmp); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; @@ -1429,7 +1460,14 @@ static void gc_worker(struct work_struct *work) */ rcu_read_unlock(); cond_resched(); - } while (++buckets < goal); + i++; + + if (time_after(jiffies, end_time) && i < hashsz) { + gc_work->next_bucket = i; + next_run = 0; + break; + } + } while (i < hashsz); if (gc_work->exiting) return; @@ -1440,40 +1478,17 @@ static void gc_worker(struct work_struct *work) * * This worker is only here to reap expired entries when system went * idle after a busy period. - * - * The heuristics below are supposed to balance conflicting goals: - * - * 1. Minimize time until we notice a stale entry - * 2. Maximize scan intervals to not waste cycles - * - * Normally, expire ratio will be close to 0. - * - * As soon as a sizeable fraction of the entries have expired - * increase scan frequency. */ - ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio > GC_EVICT_RATIO) { - gc_work->next_gc_run = min_interval; - } else { - unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; - - BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); - - gc_work->next_gc_run += min_interval; - if (gc_work->next_gc_run > max) - gc_work->next_gc_run = max; + if (next_run) { + gc_work->early_drop = false; + gc_work->next_bucket = 0; } - - next_run = gc_work->next_gc_run; - gc_work->last_bucket = i; - gc_work->early_drop = false; queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); - gc_work->next_gc_run = HZ; gc_work->exiting = false; } @@ -1484,7 +1499,7 @@ __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); unsigned int ct_count; struct nf_conn *ct; @@ -1556,7 +1571,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); smp_mb__before_atomic(); atomic_dec(&cnet->count); @@ -1614,7 +1629,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, GFP_ATOMIC); local_bh_disable(); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); @@ -2317,7 +2332,7 @@ __nf_ct_unconfirmed_destroy(struct net *net) void nf_ct_unconfirmed_destroy(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); @@ -2333,7 +2348,7 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct iter_data d; might_sleep(); @@ -2367,7 +2382,7 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) down_read(&net_rwsem); for_each_net(net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) == 0) continue; @@ -2449,7 +2464,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); nf_ct_iterate_cleanup(kill_all, net, 0, 0); if (atomic_read(&cnet->count) != 0) @@ -2461,7 +2476,6 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_conntrack_proto_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); @@ -2613,26 +2627,24 @@ int nf_conntrack_init_start(void) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { - /* Idea from tcp.c: use 1/16384 of memory. - * On i386: 32MB machine has 512 buckets. - * >= 1GB machines have 16384 buckets. - * >= 4GB machines have 65536 buckets. - */ nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) - nf_conntrack_htable_size = 65536; + if (BITS_PER_LONG >= 64 && + nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; + nf_conntrack_htable_size = 65536; - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; + if (nf_conntrack_htable_size < 1024) + nf_conntrack_htable_size = 1024; + /* Use a max. factor of one by default to keep the average + * hash chain length at 2 entries. Each entry has to be added + * twice (once for original direction, once for reply). + * When a table size is given we use the old value of 8 to + * avoid implicit reduction of the max entries setting. + */ + max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); @@ -2733,7 +2745,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; int cpu; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 759d87aef95f..41768ff19464 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -27,8 +27,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) @@ -132,58 +130,77 @@ static void ecache_work(struct work_struct *work) schedule_delayed_work(&cnet->ecache_dwork, delay); } -int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, - u32 portid, int report) +static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, + const unsigned int events, + const unsigned long missed, + const struct nf_ct_event *item) { - int ret = 0; - struct net *net = nf_ct_net(ct); + struct nf_conn *ct = item->ct; + struct net *net = nf_ct_net(item->ct); struct nf_ct_event_notifier *notify; - struct nf_conntrack_ecache *e; + int ret; + + if (!((events | missed) & e->ctmask)) + return 0; rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); - if (!notify) - goto out_unlock; + if (!notify) { + rcu_read_unlock(); + return 0; + } + + ret = notify->ct_event(events | missed, item); + rcu_read_unlock(); + + if (likely(ret >= 0 && missed == 0)) + return 0; + + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + + return ret; +} + +int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, + u32 portid, int report) +{ + struct nf_conntrack_ecache *e; + struct nf_ct_event item; + unsigned long missed; + int ret; + + if (!nf_ct_is_confirmed(ct)) + return 0; e = nf_ct_ecache_find(ct); if (!e) - goto out_unlock; + return 0; - if (nf_ct_is_confirmed(ct)) { - struct nf_ct_event item = { - .ct = ct, - .portid = e->portid ? e->portid : portid, - .report = report - }; - /* This is a resent of a destroy event? If so, skip missed */ - unsigned long missed = e->portid ? 0 : e->missed; - - if (!((eventmask | missed) & e->ctmask)) - goto out_unlock; - - ret = notify->fcn(eventmask | missed, &item); - if (unlikely(ret < 0 || missed)) { - spin_lock_bh(&ct->lock); - if (ret < 0) { - /* This is a destroy event that has been - * triggered by a process, we store the PORTID - * to include it in the retransmission. - */ - if (eventmask & (1 << IPCT_DESTROY)) { - if (e->portid == 0 && portid != 0) - e->portid = portid; - e->state = NFCT_ECACHE_DESTROY_FAIL; - } else { - e->missed |= eventmask; - } - } else { - e->missed &= ~missed; - } - spin_unlock_bh(&ct->lock); - } + memset(&item, 0, sizeof(item)); + + item.ct = ct; + item.portid = e->portid ? e->portid : portid; + item.report = report; + + /* This is a resent of a destroy event? If so, skip missed */ + missed = e->portid ? 0 : e->missed; + + ret = __nf_conntrack_eventmask_report(e, events, missed, &item); + if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { + /* This is a destroy event that has been triggered by a process, + * we store the PORTID to include it in the retransmission. + */ + if (e->portid == 0 && portid != 0) + e->portid = portid; + e->state = NFCT_ECACHE_DESTROY_FAIL; } -out_unlock: - rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); @@ -192,53 +209,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - unsigned long events, missed; - struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; struct nf_ct_event item; - int ret; - - rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_conntrack_event_cb); - if (notify == NULL) - goto out_unlock; + unsigned long events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; + return; e = nf_ct_ecache_find(ct); if (e == NULL) - goto out_unlock; + return; events = xchg(&e->cache, 0); - /* We make a copy of the missed event cache without taking - * the lock, thus we may send missed events twice. However, - * this does not harm and it happens very rarely. */ - missed = e->missed; - - if (!((events | missed) & e->ctmask)) - goto out_unlock; - item.ct = ct; item.portid = 0; item.report = 0; - ret = notify->fcn(events | missed, &item); - - if (likely(ret == 0 && !missed)) - goto out_unlock; - - spin_lock_bh(&ct->lock); - if (ret < 0) - e->missed |= events; - else - e->missed &= ~missed; - spin_unlock_bh(&ct->lock); - -out_unlock: - rcu_read_unlock(); + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. + */ + __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); @@ -248,11 +240,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, { struct net *net = nf_ct_exp_net(exp); - struct nf_exp_event_notifier *notify; + struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_expect_event_cb); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; @@ -266,89 +258,38 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, .portid = portid, .report = report }; - notify->fcn(1 << event, &item); + notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } -int nf_conntrack_register_notifier(struct net *net, - struct nf_ct_event_notifier *new) +void nf_conntrack_register_notifier(struct net *net, + const struct nf_ct_event_notifier *new) { - int ret; struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); - if (notify != NULL) { - ret = -EBUSY; - goto out_unlock; - } + WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); - ret = 0; - -out_unlock: mutex_unlock(&nf_ct_ecache_mutex); - return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -void nf_conntrack_unregister_notifier(struct net *net, - struct nf_ct_event_notifier *new) +void nf_conntrack_unregister_notifier(struct net *net) { - struct nf_ct_event_notifier *notify; - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); - /* synchronize_rcu() is called from ctnetlink_exit. */ + /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_ct_expect_register_notifier(struct net *net, - struct nf_exp_event_notifier *new) -{ - int ret; - struct nf_exp_event_notifier *notify; - - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - if (notify != NULL) { - ret = -EBUSY; - goto out_unlock; - } - rcu_assign_pointer(net->ct.nf_expect_event_cb, new); - ret = 0; - -out_unlock: - mutex_unlock(&nf_ct_ecache_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); - -void nf_ct_expect_unregister_notifier(struct net *net, - struct nf_exp_event_notifier *new) -{ - struct nf_exp_event_notifier *notify; - - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - BUG_ON(notify != new); - RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); - mutex_unlock(&nf_ct_ecache_mutex); - /* synchronize_rcu() is called from ctnetlink_exit. */ -} -EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); - void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache_dwork)) { @@ -371,7 +312,7 @@ static const struct nf_ct_ext_type event_extend = { void nf_conntrack_ecache_pernet_init(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; cnet->ct_net = &net->ct; @@ -380,7 +321,7 @@ void nf_conntrack_ecache_pernet_init(struct net *net) void nf_conntrack_ecache_pernet_fini(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache_dwork); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index efdd391b3f72..f562eeef4234 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -17,7 +17,7 @@ #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> @@ -41,9 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; -static unsigned int nf_ct_expect_hashrnd __read_mostly; - -extern unsigned int nf_conntrack_net_id; +static siphash_key_t nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -58,7 +56,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del_rcu(&exp->hnode); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); @@ -83,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t) static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash, seed; + struct { + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u8 l3num; + u8 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; + u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); - seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); + memset(&combined, 0, sizeof(combined)); + + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(n); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.l3num = tuple->src.l3num; + combined.protonum = tuple->dst.protonum; - hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), - (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ seed); + hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } @@ -123,7 +132,7 @@ __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; @@ -164,7 +173,7 @@ nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; @@ -397,7 +406,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); @@ -468,7 +477,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, } } - cnet = net_generic(net, nf_conntrack_net_id); + cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index aafaff00baf1..2eb31ffb3d14 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -194,7 +194,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, if (tcpdatalen == 4) { /* Separate TPKT header */ /* Netmeeting sends TPKT header and data separately */ pr_debug("nf_ct_h323: separate TPKT header indicates " - "there will be TPKT data of %hu bytes\n", + "there will be TPKT data of %d bytes\n", tpktlen - 4); info->tpkt_len[dir] = tpktlen - 4; return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ac396cc8bfae..ae4488a13c70 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -43,8 +43,6 @@ MODULE_PARM_DESC(nf_conntrack_helper, static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; -extern unsigned int nf_conntrack_net_id; - /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -214,7 +212,7 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); static struct nf_conntrack_helper * nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (!cnet->sysctl_auto_assign_helper) { if (cnet->auto_assign_helper_warned) @@ -560,7 +558,7 @@ static const struct nf_ct_ext_type helper_extend = { void nf_conntrack_helper_pernet_init(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8690fc07030f..f1e5443fe7c7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -218,6 +218,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb, if (!help) return 0; + rcu_read_lock(); helper = rcu_dereference(help->helper); if (!helper) goto out; @@ -233,9 +234,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb, nla_nest_end(skb, nest_helper); out: + rcu_read_unlock(); return 0; nla_put_failure: + rcu_read_unlock(); return -1; } @@ -703,7 +706,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) } static int -ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) +ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) { const struct nf_conntrack_zone *zone; struct net *net; @@ -849,6 +852,11 @@ static int ctnetlink_done(struct netlink_callback *cb) return 0; } +struct ctnetlink_filter_u32 { + u32 val; + u32 mask; +}; + struct ctnetlink_filter { u8 family; @@ -859,10 +867,8 @@ struct ctnetlink_filter { struct nf_conntrack_tuple reply; struct nf_conntrack_zone zone; - struct { - u_int32_t val; - u_int32_t mask; - } mark; + struct ctnetlink_filter_u32 mark; + struct ctnetlink_filter_u32 status; }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { @@ -904,6 +910,46 @@ static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], struct nf_conntrack_zone *zone, u_int32_t flags); +static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark, + const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK]) { + mark->val = ntohl(nla_get_be32(cda[CTA_MARK])); + + if (cda[CTA_MARK_MASK]) + mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + else + mark->mask = 0xffffffff; + } else if (cda[CTA_MARK_MASK]) { + return -EINVAL; + } +#endif + return 0; +} + +static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status, + const struct nlattr * const cda[]) +{ + if (cda[CTA_STATUS]) { + status->val = ntohl(nla_get_be32(cda[CTA_STATUS])); + if (cda[CTA_STATUS_MASK]) + status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK])); + else + status->mask = status->val; + + /* status->val == 0? always true, else always false. */ + if (status->mask == 0) + return -EINVAL; + } else if (cda[CTA_STATUS_MASK]) { + return -EINVAL; + } + + /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */ + BUILD_BUG_ON(__IPS_MAX_BIT >= 32); + return 0; +} + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { @@ -921,18 +967,14 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; -#ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK]) { - filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - if (cda[CTA_MARK_MASK]) - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); - else - filter->mark.mask = 0xffffffff; - } else if (cda[CTA_MARK_MASK]) { - err = -EINVAL; + err = ctnetlink_filter_parse_mark(&filter->mark, cda); + if (err) goto err_filter; - } -#endif + + err = ctnetlink_filter_parse_status(&filter->status, cda); + if (err) + goto err_filter; + if (!cda[CTA_FILTER]) return filter; @@ -986,7 +1028,7 @@ err_filter: static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { - return family || cda[CTA_MARK] || cda[CTA_FILTER]; + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS]; } static int ctnetlink_start(struct netlink_callback *cb) @@ -1079,6 +1121,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; + u32 status; if (filter == NULL) goto out; @@ -1110,6 +1153,9 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if ((ct->mark & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif + status = (u32)READ_ONCE(ct->status); + if ((status & filter->status.mask) != filter->status.val) + goto ignore_entry; out: return 1; @@ -1492,6 +1538,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, + [CTA_STATUS_MASK] = { .type = NLA_U32 }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1528,7 +1575,7 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u8 family = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -1541,12 +1588,12 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, - nfmsg->nfgen_family, &zone); + family, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, - nfmsg->nfgen_family, &zone); + family, &zone); else { - u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; + u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, @@ -1586,8 +1633,7 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -1628,9 +1674,8 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, ct = nf_ct_tuplehash_to_ctrack(h); - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_put(ct); return -ENOMEM; } @@ -1640,21 +1685,12 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static int ctnetlink_done_list(struct netlink_callback *cb) @@ -2373,10 +2409,9 @@ static int ctnetlink_new_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; @@ -2493,7 +2528,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, - htonl(st->clash_resolve))) + htonl(st->clash_resolve)) || + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, + htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -2590,21 +2627,12 @@ static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { @@ -2643,6 +2671,8 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) + + ctnetlink_acct_size(ct) + + ctnetlink_timestamp_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ @@ -2700,6 +2730,10 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; + if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) + goto nla_put_failure; + if (ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; @@ -3078,7 +3112,7 @@ nla_put_failure: #ifdef CONFIG_NF_CONNTRACK_EVENTS static int -ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) +ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item) { struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); @@ -3278,8 +3312,7 @@ static int ctnetlink_get_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; @@ -3329,11 +3362,10 @@ static int ctnetlink_get_expect(struct sk_buff *skb, } } - err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_expect_put(exp); - goto out; + return -ENOMEM; } rcu_read_lock(); @@ -3342,21 +3374,12 @@ static int ctnetlink_get_expect(struct sk_buff *skb, exp); rcu_read_unlock(); nf_ct_expect_put(exp); - if (err <= 0) - goto free; - - err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + } -free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) @@ -3378,8 +3401,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; @@ -3630,8 +3652,7 @@ static int ctnetlink_new_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; @@ -3742,11 +3763,8 @@ static int ctnetlink_stat_exp_cpu(struct sk_buff *skb, #ifdef CONFIG_NF_CONNTRACK_EVENTS static struct nf_ct_event_notifier ctnl_notifier = { - .fcn = ctnetlink_conntrack_event, -}; - -static struct nf_exp_event_notifier ctnl_notifier_exp = { - .fcn = ctnetlink_expect_event, + .ct_event = ctnetlink_conntrack_event, + .exp_event = ctnetlink_expect_event, }; #endif @@ -3839,52 +3857,21 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); static int __net_init ctnetlink_net_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - int ret; - - ret = nf_conntrack_register_notifier(net, &ctnl_notifier); - if (ret < 0) { - pr_err("ctnetlink_init: cannot register notifier.\n"); - goto err_out; - } - - ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); - if (ret < 0) { - pr_err("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } + nf_conntrack_register_notifier(net, &ctnl_notifier); #endif return 0; - -#ifdef CONFIG_NF_CONNTRACK_EVENTS -err_unreg_notifier: - nf_conntrack_unregister_notifier(net, &ctnl_notifier); -err_out: - return ret; -#endif } -static void ctnetlink_net_exit(struct net *net) +static void ctnetlink_net_pre_exit(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); - nf_conntrack_unregister_notifier(net, &ctnl_notifier); + nf_conntrack_unregister_notifier(net); #endif } -static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) -{ - struct net *net; - - list_for_each_entry(net, net_exit_list, exit_list) - ctnetlink_net_exit(net); - - /* wait for other cpus until they are done with ctnl_notifiers */ - synchronize_rcu(); -} - static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, - .exit_batch = ctnetlink_net_exit_batch, + .pre_exit = ctnetlink_net_pre_exit, }; static int __init ctnetlink_init(void) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index dc9ca12b0489..8f7a9837349c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -42,17 +42,16 @@ #include <net/ipv6.h> #include <net/inet_frag.h> -extern unsigned int nf_conntrack_net_id; - static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL -__printf(5, 6) +__printf(4, 5) void nf_l4proto_log_invalid(const struct sk_buff *skb, - struct net *net, - u16 pf, u8 protonum, + const struct nf_hook_state *state, + u8 protonum, const char *fmt, ...) { + struct net *net = state->net; struct va_format vaf; va_list args; @@ -64,15 +63,16 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, vaf.fmt = fmt; vaf.va = &args; - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_proto_%d: %pV ", protonum, &vaf); + nf_log_packet(net, state->pf, 0, skb, state->in, state->out, + NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid); -__printf(3, 4) +__printf(4, 5) void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, + const struct nf_hook_state *state, const char *fmt, ...) { struct va_format vaf; @@ -87,7 +87,7 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, vaf.fmt = fmt; vaf.va = &args; - nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct), + nf_l4proto_log_invalid(skb, state, nf_ct_protonum(ct), "%pV", &vaf); va_end(args); } @@ -446,7 +446,7 @@ static struct nf_ct_bridge_info *nf_ct_bridge_info; static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); bool fixup_needed = false, retry = true; int err = 0; retry: @@ -531,7 +531,7 @@ retry: static void nf_ct_netns_do_put(struct net *net, u8 nfproto) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { @@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net) #endif } -void nf_conntrack_proto_pernet_fini(struct net *net) -{ -#ifdef CONFIG_NF_CT_PROTO_GRE - nf_ct_gre_keymap_flush(net); -#endif -} - module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 4f33307fa3cf..c1557d47ccd1 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -382,7 +382,8 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - const struct dccp_hdr *dh) + const struct dccp_hdr *dh, + const struct nf_hook_state *hook_state) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; @@ -414,7 +415,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - nf_ct_l4proto_log_invalid(skb, ct, "%s", msg); + nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); return false; } @@ -464,8 +465,7 @@ static bool dccp_error(const struct dccp_hdr *dh, } return false; out_invalid: - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_DCCP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); return true; } @@ -488,7 +488,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, return -NF_ACCEPT; type = dh->dccph_type; - if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh)) + if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) return -NF_ACCEPT; if (type == DCCP_PKT_RESET && @@ -543,11 +543,11 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet"); + nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition"); + nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index db11e403d818..728eeb0aea87 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net) return &net->ct.nf_ct_proto.gre; } -void nf_ct_gre_keymap_flush(struct net *net) -{ - struct nf_gre_net *net_gre = gre_pernet(net); - struct nf_ct_gre_keymap *km, *tmp; - - spin_lock_bh(&keymap_lock); - list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { - list_del_rcu(&km->list); - kfree_rcu(km, rcu); - } - spin_unlock_bh(&keymap_lock); -} - static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) { diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 4efd8741c105..b38b7164acd5 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -170,12 +170,12 @@ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { if (state->pf == AF_INET) { - nf_l4proto_log_invalid(skb, state->net, state->pf, + nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI4 != inner %pI4", &outer_daddr->ip, &ct_daddr->ip); } else if (state->pf == AF_INET6) { - nf_l4proto_log_invalid(skb, state->net, state->pf, + nf_l4proto_log_invalid(skb, state, l4proto, "outer daddr %pI6 != inner %pI6", &outer_daddr->ip6, &ct_daddr->ip6); @@ -197,8 +197,7 @@ static void icmp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_ICMP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index facd8c64ec4e..61e3b05cf02c 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -126,8 +126,7 @@ static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_ICMPV6, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fb8dc02e502f..2394238d01c9 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -351,7 +351,7 @@ static bool sctp_error(struct sk_buff *skb, } return false; out_invalid: - nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg); + nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 34e22416a721..af5115e127cf 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -446,14 +446,15 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static bool tcp_in_window(const struct nf_conn *ct, - struct ip_ct_tcp *state, +static bool tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *tcph) + const struct tcphdr *tcph, + const struct nf_hook_state *hook_state) { + struct ip_ct_tcp *state = &ct->proto.tcp; struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; @@ -670,7 +671,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tn->tcp_be_liberal) res = true; if (!res) { - nf_ct_l4proto_log_invalid(skb, ct, + nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? @@ -710,7 +711,7 @@ static void tcp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ @@ -822,6 +823,22 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } +static bool tcp_can_early_drop(const struct nf_conn *ct) +{ + switch (ct->proto.tcp.state) { + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + case TCP_CONNTRACK_CLOSE: + case TCP_CONNTRACK_CLOSE_WAIT: + return true; + default: + break; + } + + return false; +} + /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -970,7 +987,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, + nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d ignored, state %s", index, dir, tcp_conntrack_names[old_state]); @@ -995,7 +1012,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); + nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" @@ -1010,7 +1027,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); + nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; @@ -1029,13 +1046,33 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, if (index != TCP_RST_SET) break; - if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) { + /* If we are closing, tuple might have been re-used already. + * last_index, last_ack, and all other ct fields used for + * sequence/window validation are outdated in that case. + * + * As the conntrack can already be expired by GC under pressure, + * just skip validation checks. + */ + if (tcp_can_early_drop(ct)) + goto in_window; + + /* td_maxack might be outdated if we let a SYN through earlier */ + if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) && + ct->proto.tcp.last_index != TCP_SYN_SET) { u32 seq = ntohl(th->seq); - if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) { + /* If we are not in established state and SEQ=0 this is most + * likely an answer to a SYN we let go through above (last_index + * can be updated due to out-of-order ACKs). + */ + if (seq == 0 && !nf_conntrack_tcp_established(ct)) + break; + + if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) && + !tn->tcp_ignore_invalid_rst) { /* Invalid RST */ spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); + nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); return -NF_ACCEPT; } @@ -1079,8 +1116,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, break; } - if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, - skb, dataoff, th)) { + if (!tcp_in_window(ct, dir, index, + skb, dataoff, th, state)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } @@ -1133,6 +1170,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } + + if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { + /* do not renew timeout on SYN retransmit. + * + * Else port reuse by client or NAT middlebox can keep + * entry alive indefinitely (including nat info). + */ + return NF_ACCEPT; + } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ @@ -1154,22 +1201,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, return NF_ACCEPT; } -static bool tcp_can_early_drop(const struct nf_conn *ct) -{ - switch (ct->proto.tcp.state) { - case TCP_CONNTRACK_FIN_WAIT: - case TCP_CONNTRACK_LAST_ACK: - case TCP_CONNTRACK_TIME_WAIT: - case TCP_CONNTRACK_CLOSE: - case TCP_CONNTRACK_CLOSE_WAIT: - return true; - default: - break; - } - - return false; -} - #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> @@ -1436,11 +1467,18 @@ void nf_conntrack_tcp_init_net(struct net *net) */ tn->tcp_be_liberal = 0; + /* If it's non-zero, we turn off RST sequence number check */ + tn->tcp_ignore_invalid_rst = 0; + /* Max number of the retransmitted packets without receiving an (acceptable) * ACK from the destination. If this number is reached, a shorter timer * will be started. */ tn->tcp_max_retrans = 3; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + tn->offload_timeout = 30 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index af402f458ee0..f8e3c0d2602f 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -38,8 +38,7 @@ static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_UDP, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, @@ -130,8 +129,7 @@ static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { - nf_l4proto_log_invalid(skb, state->net, state->pf, - IPPROTO_UDPLITE, "%s", msg); + nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, @@ -270,6 +268,10 @@ void nf_conntrack_udp_init_net(struct net *net) for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + un->offload_timeout = 30 * HZ; +#endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index aaa55246d0ca..80f675d884b2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -22,6 +22,9 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> +#ifdef CONFIG_LWTUNNEL +#include <net/netfilter/nf_hooks_lwtunnel.h> +#endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; @@ -429,7 +432,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { - seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } @@ -444,7 +447,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->invalid, 0, 0, - 0, + st->chaintoolong, st->insert, st->insert_failed, st->drop, @@ -512,9 +515,7 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) u32 nf_conntrack_count(const struct net *net) { - const struct nf_conntrack_net *cnet; - - cnet = net_generic(net, nf_conntrack_net_id); + const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } @@ -575,11 +576,18 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, +#endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, + NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, +#endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP @@ -607,6 +615,9 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif +#ifdef CONFIG_LWTUNNEL + NF_SYSCTL_CT_LWTUNNEL, +#endif __NF_SYSCTL_CT_LAST_SYSCTL, }; @@ -762,6 +773,14 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { + .procname = "nf_flowtable_tcp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), @@ -778,6 +797,14 @@ static struct ctl_table nf_ct_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = { + .procname = "nf_conntrack_tcp_ignore_invalid_rst", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", .maxlen = sizeof(u8), @@ -796,6 +823,14 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, +#if IS_ENABLED(CONFIG_NFT_FLOW_OFFLOAD) + [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { + .procname = "nf_flowtable_udp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), @@ -930,6 +965,15 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_LWTUNNEL + [NF_SYSCTL_CT_LWTUNNEL] = { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +#endif {} }; @@ -970,7 +1014,13 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, XASSIGN(LOOSE, &tn->tcp_loose); XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); + XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst); #undef XASSIGN + +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; +#endif + } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, @@ -1032,7 +1082,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, static int nf_conntrack_standalone_init_sysctl(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; @@ -1059,6 +1109,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; +#if IS_ENABLED(CONFIG_NF_FLOW_TABLE) + table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; +#endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); @@ -1085,7 +1138,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { - struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 1d02650dd715..87a7388b6c89 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -99,7 +99,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: - flow_tuple->mtu = ip6_dst_mtu_forward(dst); + flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } @@ -178,25 +178,28 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) tcp->seen[1].td_maxwin = 0; } -#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) -#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) - static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { - const struct nf_conntrack_l4proto *l4proto; + struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); - unsigned int timeout; + s32 timeout; - l4proto = nf_ct_l4proto_find(l4num); - if (!l4proto) - return; + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); - if (l4num == IPPROTO_TCP) - timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; - else if (l4num == IPPROTO_UDP) - timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; - else + timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + timeout -= tn->offload_timeout; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->timeouts[UDP_CT_REPLIED]; + timeout -= tn->offload_timeout; + } else { return; + } + + if (timeout < 0) + timeout = 0; if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) ct->timeout = nfct_time_stamp + timeout; @@ -268,11 +271,30 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { .automatic_shrinking = true, }; +unsigned long flow_offload_get_timeout(struct flow_offload *flow) +{ + unsigned long timeout = NF_FLOW_TIMEOUT; + struct net *net = nf_ct_net(flow->ct); + int l4num = nf_ct_protonum(flow->ct); + + if (l4num == IPPROTO_TCP) { + struct nf_tcp_net *tn = nf_tcp_pernet(net); + + timeout = tn->offload_timeout; + } else if (l4num == IPPROTO_UDP) { + struct nf_udp_net *tn = nf_udp_pernet(net); + + timeout = tn->offload_timeout; + } + + return timeout; +} + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, @@ -304,7 +326,11 @@ EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow) { - flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; + u32 timeout; + + timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); + if (READ_ONCE(flow->timeout) != timeout) + WRITE_ONCE(flow->timeout, timeout); if (likely(!nf_flowtable_hw_offload(flow_table))) return; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 528b2f172684..d6bf1b2cd541 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -251,8 +251,7 @@ static int flow_offload_eth_src(struct net *net, flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, &val, &mask); - if (dev) - dev_put(dev); + dev_put(dev); return 0; } @@ -937,7 +936,7 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, - lastused + NF_FLOW_TIMEOUT); + lastused + flow_offload_get_timeout(offload->flow)); if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) { if (stats[0].pkts) @@ -1041,7 +1040,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, __s32 delta; delta = nf_flow_timeout_delta(flow->timeout); - if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10)) + if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10)) return; offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); @@ -1097,6 +1096,7 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; + bo->cb_list_head = &flowtable->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c new file mode 100644 index 000000000000..00e89ffd78f6 --- /dev/null +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sysctl.h> +#include <net/lwtunnel.h> +#include <net/netfilter/nf_hooks_lwtunnel.h> + +static inline int nf_hooks_lwtunnel_get(void) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return 1; + else + return 0; +} + +static inline int nf_hooks_lwtunnel_set(int enable) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) { + if (!enable) + return -EBUSY; + } else if (enable) { + static_branch_enable(&nf_hooks_lwtunnel_enabled); + } + + return 0; +} + +#ifdef CONFIG_SYSCTL +int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int proc_nf_hooks_lwtunnel_enabled = 0; + struct ctl_table tmp = { + .procname = table->procname, + .data = &proc_nf_hooks_lwtunnel_enabled, + .maxlen = sizeof(int), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + int ret; + + if (!write) + proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get(); + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); +#endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..7008961f5cb0 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -13,7 +13,7 @@ #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> @@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +static siphash_key_t nf_nat_hash_rnd __read_mostly; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; @@ -153,12 +153,22 @@ static unsigned int hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + struct { + struct nf_conntrack_man src; + u32 net_mix; + u32 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + memset(&combined, 0, sizeof(combined)); + /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); + combined.src = tuple->src; + combined.net_mix = net_hash_mix(n); + combined.protonum = tuple->dst.protonum; + + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index bbd1209694b8..6d12afabfe8a 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -21,6 +21,8 @@ #include "nf_internals.h" +static const struct nf_queue_handler __rcu *nf_queue_handler; + /* * Hook for nfnetlink_queue to register its queue handler. * We do this so that most of the NFQUEUE code can be modular. @@ -29,20 +31,18 @@ * receives, no matter what. */ -/* return EBUSY when somebody else is registered, return EEXIST if the - * same handler is registered, return 0 in case of success. */ -void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ - WARN_ON(rcu_access_pointer(net->nf.queue_handler)); - rcu_assign_pointer(net->nf.queue_handler, qh); + WARN_ON(rcu_access_pointer(nf_queue_handler)); + rcu_assign_pointer(nf_queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -void nf_unregister_queue_handler(struct net *net) +void nf_unregister_queue_handler(void) { - RCU_INIT_POINTER(net->nf.queue_handler, NULL); + RCU_INIT_POINTER(nf_queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); @@ -51,18 +51,14 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ - if (state->in) - dev_put(state->in); - if (state->out) - dev_put(state->out); + dev_put(state->in); + dev_put(state->out); if (state->sk) sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->physin) - dev_put(entry->physin); - if (entry->physout) - dev_put(entry->physout); + dev_put(entry->physin); + dev_put(entry->physout); #endif } @@ -95,18 +91,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; - if (state->in) - dev_hold(state->in); - if (state->out) - dev_hold(state->out); + dev_hold(state->in); + dev_hold(state->out); if (state->sk) sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->physin) - dev_hold(entry->physin); - if (entry->physout) - dev_hold(entry->physout); + dev_hold(entry->physin); + dev_hold(entry->physout); #endif } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -116,7 +108,7 @@ void nf_queue_nf_hook_drop(struct net *net) const struct nf_queue_handler *qh; rcu_read_lock(); - qh = rcu_dereference(net->nf.queue_handler); + qh = rcu_dereference(nf_queue_handler); if (qh) qh->nf_hook_drop(net); rcu_read_unlock(); @@ -157,12 +149,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, { struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; - struct net *net = state->net; unsigned int route_key_size; int status; /* QUEUE == DROP if no one is waiting, to be safe. */ - qh = rcu_dereference(net->nf.queue_handler); + qh = rcu_dereference(nf_queue_handler); if (!qh) return -ESRCH; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bf4d6ec9fc55..081437dd75b7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -571,7 +571,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && - table->nlpid != nlpid) + nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; @@ -583,7 +583,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; @@ -591,8 +591,13 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && - nft_active_genmask(table, genmask)) + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + nlpid && table->nlpid != nlpid) + return ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -862,10 +867,9 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct sk_buff *skb2; @@ -1068,10 +1072,9 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -1263,10 +1266,9 @@ out: static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -1279,7 +1281,8 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; - table = nft_table_lookup_byhandle(net, attr, genmask); + table = nft_table_lookup_byhandle(net, attr, genmask, + NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, @@ -1636,10 +1639,9 @@ done: static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; struct net *net = info->net; struct nft_table *table; @@ -2015,11 +2017,12 @@ static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family, const struct nft_chain_hook *hook, struct nft_chain *chain) { - ops->pf = family; - ops->hooknum = hook->num; - ops->priority = hook->priority; - ops->priv = chain; - ops->hook = hook->type->hooks[ops->hooknum]; + ops->pf = family; + ops->hooknum = hook->num; + ops->priority = hook->priority; + ops->priv = chain; + ops->hook = hook->type->hooks[ops->hooknum]; + ops->hook_ops_type = NF_HOOK_OP_NF_TABLES; } static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, @@ -2371,10 +2374,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; const struct nlattr *attr; @@ -2469,10 +2471,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -3096,10 +3097,9 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; struct net *net = info->net; @@ -3237,15 +3237,14 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { struct nftables_pernet *nft_net = nft_pernet(info->net); - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; unsigned int size, i, n, ulen = 0, usize = 0; u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; + struct nft_flow_rule *flow = NULL; struct net *net = info->net; - struct nft_flow_rule *flow; struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; @@ -3340,13 +3339,13 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) - goto err1; + goto err_release_expr; if (n == NFT_RULE_MAXEXPRS) - goto err1; + goto err_release_expr; err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) { NL_SET_BAD_ATTR(extack, tmp); - goto err1; + goto err_release_expr; } size += expr_info[n].ops->size; n++; @@ -3355,7 +3354,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, /* Check for overflow of dlen field */ err = -EFBIG; if (size >= 1 << 12) - goto err1; + goto err_release_expr; if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); @@ -3366,7 +3365,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err = -ENOMEM; rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); if (rule == NULL) - goto err1; + goto err_release_expr; nft_activate_next(net, rule); @@ -3385,7 +3384,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { NL_SET_BAD_ATTR(extack, expr_info[i].attr); - goto err2; + goto err_release_rule; } if (expr_info[i].ops->validate) @@ -3395,16 +3394,24 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, expr = nft_expr_next(expr); } + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(net, rule); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto err_release_rule; + } + } + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } err = nft_delrule(&ctx, old_rule); if (err < 0) { nft_trans_destroy(trans); - goto err2; + goto err_destroy_flow_rule; } list_add_tail_rcu(&rule->list, &old_rule->list); @@ -3412,7 +3419,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } if (info->nlh->nlmsg_flags & NLM_F_APPEND) { @@ -3430,21 +3437,20 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, kvfree(expr_info); chain->use++; + if (flow) + nft_trans_flow_rule(trans) = flow; + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { - flow = nft_flow_rule_create(net, rule); - if (IS_ERR(flow)) - return PTR_ERR(flow); - - nft_trans_flow_rule(trans) = flow; - } - return 0; -err2: + +err_destroy_flow_rule: + if (flow) + nft_flow_rule_destroy(flow); +err_release_rule: nf_tables_rule_release(&ctx, rule); -err1: +err_release_expr: for (i = 0; i < n; i++) { if (expr_info[i].ops) { module_put(expr_info[i].ops->type->owner); @@ -3477,15 +3483,15 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; - int family = nfmsg->nfgen_family, err = 0; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct nft_chain *chain = NULL; struct net *net = info->net; struct nft_table *table; struct nft_rule *rule; struct nft_ctx ctx; + int err = 0; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -3665,30 +3671,6 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask, u32 nlpid) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table = NULL; - - if (nla[NFTA_SET_TABLE] != NULL) { - table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask, nlpid); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); - return PTR_ERR(table); - } - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - static struct nft_set *nft_set_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { @@ -4068,20 +4050,26 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; + struct nft_table *table = NULL; struct net *net = info->net; const struct nft_set *set; struct sk_buff *skb2; struct nft_ctx ctx; int err; - /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, 0); - if (err < 0) - return err; + if (nla[NFTA_SET_TABLE]) { + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, 0); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } + } + + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -4096,12 +4084,12 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, } /* Only accept unspec with dump */ - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -4189,11 +4177,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u32 ktype, dtype, flags, policy, gc_int, objtype; struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_set_ops *ops; struct nft_expr *expr = NULL; struct net *net = info->net; @@ -4494,31 +4481,31 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - int err; - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; - if (nla[NFTA_SET_TABLE] == NULL) - return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); + return PTR_ERR(table); + } if (nla[NFTA_SET_HANDLE]) { attr = nla[NFTA_SET_HANDLE]; - set = nft_set_lookup_byhandle(ctx.table, attr, genmask); + set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(ctx.table, attr, genmask); + set = nft_set_lookup(table, attr, genmask); } if (IS_ERR(set)) { @@ -4532,6 +4519,8 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, return -EBUSY; } + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nft_delset(&ctx, set); } @@ -4733,28 +4722,6 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack, - u8 genmask, u32 nlpid) -{ - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family; - struct nft_table *table; - - table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, - genmask, nlpid); - if (IS_ERR(table)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); - return PTR_ERR(table); - } - - nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); - return 0; -} - static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_ext *ext) @@ -5212,21 +5179,27 @@ static int nf_tables_getsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; + struct nft_table *table; struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, @@ -5995,8 +5968,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb, struct nftables_pernet *nft_net = nft_pernet(info->net); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err; @@ -6004,12 +5979,14 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -6017,6 +5994,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) @@ -6024,7 +6003,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb, } if (nft_net->validate_state == NFT_VALIDATE_DO) - return nft_table_validate(net, ctx.table); + return nft_table_validate(net, table); return 0; } @@ -6262,23 +6241,29 @@ static int nf_tables_delsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; + struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, - genmask, NETLINK_CB(skb).portid); - if (err < 0) - return err; + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask, NETLINK_CB(skb).portid); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); + return PTR_ERR(table); + } - set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return nft_set_flush(&ctx, set, genmask); @@ -6546,11 +6531,10 @@ err_free_trans: static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nft_object_type *type; - int family = nfmsg->nfgen_family; struct net *net = info->net; struct nft_table *table; struct nft_object *obj; @@ -6802,10 +6786,9 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; const struct nft_table *table; struct net *net = info->net; struct nft_object *obj; @@ -6892,10 +6875,9 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; @@ -7323,12 +7305,11 @@ static int nf_tables_newflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; u8 genmask = nft_genmask_next(info->net); + u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; - int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nft_hook *hook, *next; struct net *net = info->net; @@ -7512,10 +7493,9 @@ static int nf_tables_delflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_next(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct net *net = info->net; const struct nlattr *attr; @@ -7707,9 +7687,8 @@ static int nf_tables_getflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u8 genmask = nft_genmask_cur(info->net); - int family = nfmsg->nfgen_family; + u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; struct net *net = info->net; @@ -8466,6 +8445,16 @@ static int nf_tables_commit_audit_alloc(struct list_head *adl, return 0; } +static void nf_tables_commit_audit_free(struct list_head *adl) +{ + struct nft_audit_data *adp, *adn; + + list_for_each_entry_safe(adp, adn, adl, list) { + list_del(&adp->list); + kfree(adp); + } +} + static void nf_tables_commit_audit_collect(struct list_head *adl, struct nft_table *table, u32 op) { @@ -8530,6 +8519,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table); if (ret) { nf_tables_commit_chain_prepare_cancel(net); + nf_tables_commit_audit_free(&adl); return ret; } if (trans->msg_type == NFT_MSG_NEWRULE || @@ -8539,6 +8529,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) ret = nf_tables_commit_chain_prepare(net, chain); if (ret < 0) { nf_tables_commit_chain_prepare_cancel(net); + nf_tables_commit_audit_free(&adl); return ret; } } @@ -8839,11 +8830,16 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index dbc2e945c98e..866cfba04d6c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -81,7 +81,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, else { if (!pkt->tprot_set) return false; - ptr = skb_network_header(skb) + pkt->xt.thoff; + ptr = skb_network_header(skb) + nft_thoff(pkt); } ptr += priv->offset; @@ -268,6 +268,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_meta_type, &nft_rt_type, &nft_exthdr_type, + &nft_last_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index a48c5fd53a80..9656c1646222 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -54,15 +54,10 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; - struct nft_offload_ethertype ethertype; - - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL) && - match->key.basic.n_proto != htons(ETH_P_8021Q) && - match->key.basic.n_proto != htons(ETH_P_8021AD)) - return; - - ethertype.value = match->key.basic.n_proto; - ethertype.mask = match->mask.basic.n_proto; + struct nft_offload_ethertype ethertype = { + .value = match->key.basic.n_proto, + .mask = match->mask.basic.n_proto, + }; if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || @@ -76,7 +71,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else { + } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + (match->key.basic.n_proto == htons(ETH_P_8021Q) || + match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; @@ -356,6 +353,7 @@ static void nft_flow_block_offload_init(struct flow_block_offload *bo, bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; + bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } @@ -594,23 +592,6 @@ int nft_flow_rule_offload_commit(struct net *net) } } - list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) - continue; - - switch (trans->msg_type) { - case NFT_MSG_NEWRULE: - case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) - continue; - - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); - break; - default: - break; - } - } - return err; } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 0cf3278007ba..e4fe2f0780eb 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,17 +113,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, int off = skb_network_offset(skb); unsigned int len, nh_end; - nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + nh_end = pkt->tprot_set ? nft_thoff(pkt) : skb->len; len = min_t(unsigned int, nh_end - skb_network_offset(skb), NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; if (pkt->tprot_set) { - len = min_t(unsigned int, skb->len - pkt->xt.thoff, + len = min_t(unsigned int, skb->len - nft_thoff(pkt), NFT_TRACETYPE_TRANSPORT_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) + nft_thoff(pkt), len)) return -1; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e8dbd8379027..7e2c8dd01408 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -68,6 +68,7 @@ static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = { [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper", [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables", [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat", + [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook", }; static const int nfnl_group2type[NFNLGRP_MAX+1] = { @@ -256,6 +257,7 @@ replay: .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, + .nfmsg = nlmsg_data(nlh), .extack = extack, }; @@ -491,6 +493,7 @@ replay_abort: .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, + .nfmsg = nlmsg_data(nlh), .extack = &extack, }; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 3c8cf8748cfb..505f46a32173 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -314,14 +314,11 @@ static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, kfree_skb(skb2); break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 752b10cae524..5c622f55c9d6 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -667,14 +667,10 @@ static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 38848ad68899..c57673d499be 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -287,14 +287,11 @@ static int cttimeout_get_timeout(struct sk_buff *skb, kfree_skb(skb2); break; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); + break; } + return ret; } @@ -427,9 +424,9 @@ static int cttimeout_default_get(struct sk_buff *skb, const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; - int ret, err; __u16 l3num; __u8 l4num; + int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -438,9 +435,8 @@ static int cttimeout_default_get(struct sk_buff *skb, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); - err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) - goto err; + return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: @@ -480,13 +476,11 @@ static int cttimeout_default_get(struct sk_buff *skb, } if (!timeouts) - goto err; + return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - err = -ENOMEM; - goto err; - } + if (!skb2) + return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, @@ -496,18 +490,10 @@ static int cttimeout_default_get(struct sk_buff *skb, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); - err = -ENOMEM; - goto err; + return -ENOMEM; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; -err: - return err; + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c new file mode 100644 index 000000000000..f554e2ea32ee --- /dev/null +++ b/net/netfilter/nfnetlink_hook.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021 Red Hat GmbH + * + * Author: Florian Westphal <fw@strlen.de> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_hook.h> + +#include <net/netfilter/nf_tables.h> +#include <net/sock.h> + +static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = { + [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 }, + [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFNLA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, + [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING, + .len = KSYM_NAME_LEN, }, + [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING, + .len = MODULE_NAME_LEN, }, + [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, }, +}; + +static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *c) +{ + int err; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); + err = netlink_dump_start(nlsk, skb, nlh, c); + rcu_read_lock(); + module_put(THIS_MODULE); + + return err; +} + +struct nfnl_dump_hook_data { + char devname[IFNAMSIZ]; + unsigned long headv; + u8 hook; +}; + +static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + const struct nf_hook_ops *ops) +{ + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest, *nest2; + struct nft_chain *chain; + int ret = 0; + + if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES) + return 0; + + chain = ops->priv; + if (WARN_ON_ONCE(!chain)) + return 0; + + if (!nft_is_active(net, chain)) + return 0; + + nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO); + if (!nest) + return -EMSGSIZE; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, + htonl(NFNL_HOOK_TYPE_NFTABLES)); + if (ret) + goto cancel_nest; + + nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest2) + goto cancel_nest; + + ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name); + if (ret) + goto cancel_nest; + + ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name); + if (ret) + goto cancel_nest; + + ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family); + if (ret) + goto cancel_nest; + + nla_nest_end(nlskb, nest2); + nla_nest_end(nlskb, nest); + return ret; + +cancel_nest: + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; +} + +static int nfnl_hook_dump_one(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + const struct nf_hook_ops *ops, + int family, unsigned int seq) +{ + u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET); + unsigned int portid = NETLINK_CB(nlskb).portid; + struct nlmsghdr *nlh; + int ret = -EMSGSIZE; + u32 hooknum; +#ifdef CONFIG_KALLSYMS + char sym[KSYM_SYMBOL_LEN]; + char *module_name; +#endif + nlh = nfnl_msg_put(nlskb, portid, seq, event, + NLM_F_MULTI, family, NFNETLINK_V0, 0); + if (!nlh) + goto nla_put_failure; + +#ifdef CONFIG_KALLSYMS + ret = snprintf(sym, sizeof(sym), "%ps", ops->hook); + if (ret >= sizeof(sym)) { + ret = -EINVAL; + goto nla_put_failure; + } + + module_name = strstr(sym, " ["); + if (module_name) { + char *end; + + *module_name = '\0'; + module_name += 2; + end = strchr(module_name, ']'); + if (end) { + *end = 0; + + ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name); + if (ret) + goto nla_put_failure; + } + } + + ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym); + if (ret) + goto nla_put_failure; +#endif + + if (ops->pf == NFPROTO_INET && ops->hooknum == NF_INET_INGRESS) + hooknum = NF_NETDEV_INGRESS; + else + hooknum = ops->hooknum; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(hooknum)); + if (ret) + goto nla_put_failure; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority)); + if (ret) + goto nla_put_failure; + + ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops); + if (ret) + goto nla_put_failure; + + nlmsg_end(nlskb, nlh); + return 0; +nla_put_failure: + nlmsg_trim(nlskb, nlh); + return ret; +} + +static const struct nf_hook_entries * +nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) +{ + const struct nf_hook_entries *hook_head = NULL; +#ifdef CONFIG_NETFILTER_INGRESS + struct net_device *netdev; +#endif + + switch (pf) { + case NFPROTO_IPV4: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); + break; + case NFPROTO_IPV6: + if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); + break; + case NFPROTO_ARP: +#ifdef CONFIG_NETFILTER_FAMILY_ARP + if (hook >= ARRAY_SIZE(net->nf.hooks_arp)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_arp[hook]); +#endif + break; + case NFPROTO_BRIDGE: +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); +#endif + break; +#if IS_ENABLED(CONFIG_DECNET) + case NFPROTO_DECNET: + if (hook >= ARRAY_SIZE(net->nf.hooks_decnet)) + return ERR_PTR(-EINVAL); + hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); + break; +#endif +#ifdef CONFIG_NETFILTER_INGRESS + case NFPROTO_NETDEV: + if (hook != NF_NETDEV_INGRESS) + return ERR_PTR(-EOPNOTSUPP); + + if (!dev) + return ERR_PTR(-ENODEV); + + netdev = dev_get_by_name_rcu(net, dev); + if (!netdev) + return ERR_PTR(-ENODEV); + + return rcu_dereference(netdev->nf_hooks_ingress); +#endif + default: + return ERR_PTR(-EPROTONOSUPPORT); + } + + return hook_head; +} + +static int nfnl_hook_dump(struct sk_buff *nlskb, + struct netlink_callback *cb) +{ + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nfnl_dump_hook_data *ctx = cb->data; + int err, family = nfmsg->nfgen_family; + struct net *net = sock_net(nlskb->sk); + struct nf_hook_ops * const *ops; + const struct nf_hook_entries *e; + unsigned int i = cb->args[0]; + + rcu_read_lock(); + + e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname); + if (!e) + goto done; + + if (IS_ERR(e)) { + cb->seq++; + goto done; + } + + if ((unsigned long)e != ctx->headv || i >= e->num_hook_entries) + cb->seq++; + + ops = nf_hook_entries_get_hook_ops(e); + + for (; i < e->num_hook_entries; i++) { + err = nfnl_hook_dump_one(nlskb, ctx, ops[i], family, + cb->nlh->nlmsg_seq); + if (err) + break; + } + +done: + nl_dump_check_consistent(cb, nlmsg_hdr(nlskb)); + rcu_read_unlock(); + cb->args[0] = i; + return nlskb->len; +} + +static int nfnl_hook_dump_start(struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nlattr * const *nla = cb->data; + struct nfnl_dump_hook_data *ctx = NULL; + struct net *net = sock_net(cb->skb->sk); + u8 family = nfmsg->nfgen_family; + char name[IFNAMSIZ] = ""; + const void *head; + u32 hooknum; + + hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM])); + if (hooknum > 255) + return -EINVAL; + + if (family == NFPROTO_NETDEV) { + if (!nla[NFNLA_HOOK_DEV]) + return -EINVAL; + + nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name)); + } + + rcu_read_lock(); + /* Not dereferenced; for consistency check only */ + head = nfnl_hook_entries_head(family, hooknum, net, name); + rcu_read_unlock(); + + if (head && IS_ERR(head)) + return PTR_ERR(head); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + strscpy(ctx->devname, name, sizeof(ctx->devname)); + ctx->headv = (unsigned long)head; + ctx->hook = hooknum; + + cb->seq = 1; + cb->data = ctx; + + return 0; +} + +static int nfnl_hook_dump_stop(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static int nfnl_hook_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + if (!nla[NFNLA_HOOK_HOOKNUM]) + return -EINVAL; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nfnl_hook_dump_start, + .done = nfnl_hook_dump_stop, + .dump = nfnl_hook_dump, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + return -EOPNOTSUPP; +} + +static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = { + [NFNL_MSG_HOOK_GET] = { + .call = nfnl_hook_get, + .type = NFNL_CB_RCU, + .attr_count = NFNLA_HOOK_MAX, + .policy = nfnl_hook_nla_policy + }, +}; + +static const struct nfnetlink_subsystem nfhook_subsys = { + .name = "nfhook", + .subsys_id = NFNL_SUBSYS_HOOK, + .cb_count = NFNL_MSG_HOOK_MAX, + .cb = nfnl_hook_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK); + +static int __init nfnetlink_hook_init(void) +{ + return nfnetlink_subsys_register(&nfhook_subsys); +} + +static void __exit nfnetlink_hook_exit(void) +{ + nfnetlink_subsys_unregister(&nfhook_subsys); +} + +module_init(nfnetlink_hook_init); +module_exit(nfnetlink_hook_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks"); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 587086b18c36..691ef4cffdd9 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -871,15 +871,14 @@ static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfula[]) { struct nfnl_log_net *log = nfnl_log_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t group_num = ntohs(nfmsg->res_id); + u_int16_t group_num = ntohs(info->nfmsg->res_id); struct nfulnl_msg_config_cmd *cmd = NULL; struct nfulnl_instance *inst; u16 flags = 0; int ret = 0; if (nfula[NFULA_CFG_CMD]) { - u_int8_t pf = nfmsg->nfgen_family; + u_int8_t pf = info->nfmsg->nfgen_family; cmd = nla_data(nfula[NFULA_CFG_CMD]); /* Commands without queue context */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f37a575ebd7f..4c3fbaaeb103 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -951,6 +951,16 @@ static void nfqnl_nf_hook_drop(struct net *net) struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; + /* This function is also called on net namespace error unwind, + * when pernet_ops->init() failed and ->exit() functions of the + * previous pernet_ops gets called. + * + * This may result in a call to nfqnl_nf_hook_drop() before + * struct nfnl_queue_net was allocated. + */ + if (!q) + return; + for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; @@ -1051,8 +1061,7 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u16 queue_num = ntohs(nfmsg->res_id); + u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; @@ -1160,8 +1169,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; @@ -1243,8 +1251,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); - struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); - u_int16_t queue_num = ntohs(nfmsg->res_id); + u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; @@ -1505,7 +1512,6 @@ static int __net_init nfnl_queue_net_init(struct net *net) &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif - nf_register_queue_handler(net, &nfqh); return 0; } @@ -1514,7 +1520,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net) struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; - nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif @@ -1558,6 +1563,8 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_subsys; } + nf_register_queue_handler(&nfqh); + return status; cleanup_netlink_subsys: @@ -1571,6 +1578,7 @@ out: static void __exit nfnetlink_queue_fini(void) { + nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 363bdd7044ec..5b02408a920b 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); return nft_do_chain(&pkt, priv); } @@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); return nft_do_chain(&pkt, priv); } @@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); return nft_do_chain(&pkt, priv); } @@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, switch (state->pf) { case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; default: break; @@ -174,7 +174,7 @@ static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); - if (nft_set_pktinfo_ipv4_ingress(&pkt, skb) < 0) + if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0) return NF_DROP; break; case htons(ETH_P_IPV6): @@ -182,7 +182,7 @@ static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); - if (nft_set_pktinfo_ipv6_ingress(&pkt, skb) < 0) + if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0) return NF_DROP; break; default: @@ -238,13 +238,13 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } @@ -293,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); + nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); + nft_set_pktinfo_ipv6_validate(&pkt); break; default: - nft_set_pktinfo_unspec(&pkt, skb); + nft_set_pktinfo_unspec(&pkt); break; } diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index eac4a901233f..98e4946100c5 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -17,12 +17,12 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, switch (state->pf) { #ifdef CONFIG_NF_TABLES_IPV4 case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); break; #endif #ifdef CONFIG_NF_TABLES_IPV6 case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); break; #endif default: diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c index edd02cda57fc..925db0dce48d 100644 --- a/net/netfilter/nft_chain_route.c +++ b/net/netfilter/nft_chain_route.c @@ -26,7 +26,7 @@ static unsigned int nf_route_table_hook4(void *priv, u8 tos; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); + nft_set_pktinfo_ipv4(&pkt); mark = skb->mark; iph = ip_hdr(skb); @@ -74,7 +74,7 @@ static unsigned int nf_route_table_hook6(void *priv, int err; nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); + nft_set_pktinfo_ipv6(&pkt); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 5415ab14400d..272bcdb1392d 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -57,8 +57,13 @@ union nft_entry { }; static inline void -nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) +nft_compat_set_par(struct xt_action_param *par, + const struct nft_pktinfo *pkt, + const void *xt, const void *xt_info) { + par->state = pkt->state; + par->thoff = nft_thoff(pkt); + par->fragoff = pkt->fragoff; par->target = xt; par->targinfo = xt_info; par->hotdrop = false; @@ -71,13 +76,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -97,13 +103,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; int ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info); + nft_compat_set_par(&xt, pkt, target, info); - ret = target->target(skb, &pkt->xt); + ret = target->target(skb, &xt); - if (pkt->xt.hotdrop) + if (xt.hotdrop) ret = NF_DROP; switch (ret) { @@ -350,13 +357,14 @@ static void __nft_match_eval(const struct nft_expr *expr, { struct xt_match *match = expr->ops->data; struct sk_buff *skb = pkt->skb; + struct xt_action_param xt; bool ret; - nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info); + nft_compat_set_par(&xt, pkt, match, info); - ret = match->match(skb, (struct xt_action_param *)&pkt->xt); + ret = match->match(skb, &xt); - if (pkt->xt.hotdrop) { + if (xt.hotdrop) { regs->verdict.code = NF_DROP; return; } @@ -617,7 +625,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { - struct nfgenmsg *nfmsg; + u8 family = info->nfmsg->nfgen_family; const char *name, *fmt; struct sk_buff *skb2; int ret = 0, target; @@ -632,9 +640,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); - nfmsg = nlmsg_data(info->nlh); - - switch(nfmsg->nfgen_family) { + switch(family) { case AF_INET: fmt = "ipt_%s"; break; @@ -648,8 +654,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, fmt = "arpt_%s"; break; default: - pr_err("nft_compat: unsupported protocol %d\n", - nfmsg->nfgen_family); + pr_err("nft_compat: unsupported protocol %d\n", family); return -EINVAL; } @@ -657,9 +662,8 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, return -EINVAL; rcu_read_unlock(); - try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, - rev, target, &ret), - fmt, name); + try_then_request_module(xt_find_revision(family, name, rev, target, &ret), + fmt, name); if (ret < 0) goto out_put; @@ -674,20 +678,17 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, - nfmsg->nfgen_family, - name, ret, target) <= 0) { + family, name, ret, target) <= 0) { kfree_skb(skb2); goto out_put; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); - return ret == -EAGAIN ? -ENOBUFS : ret; + + return ret; } static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 337e22d8b40b..99b1de14ff7e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -41,6 +41,7 @@ struct nft_ct_helper_obj { #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, @@ -525,8 +526,10 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: + mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); + mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: @@ -564,9 +567,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: - if (!nft_ct_tmpl_alloc_pcpu()) + mutex_lock(&nft_ct_pcpu_mutex); + if (!nft_ct_tmpl_alloc_pcpu()) { + mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; + } nft_ct_pcpu_template_refcnt++; + mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index f64f0017e9a5..af4ee874a067 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,8 +10,10 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +#include <net/sctp/sctp.h> #include <net/tcp.h> struct nft_exthdr { @@ -42,6 +44,9 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, unsigned int offset = 0; int err; + if (pkt->skb->protocol != htons(ETH_P_IPV6)) + goto err; + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); @@ -162,10 +167,10 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, { struct tcphdr *tcph; - if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + if (pkt->tprot != IPPROTO_TCP) return NULL; - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer); + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; @@ -173,7 +178,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; - return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer); + return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, @@ -249,7 +254,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, return; if (skb_ensure_writable(pkt->skb, - pkt->xt.thoff + i + priv->len)) + nft_thoff(pkt) + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -300,6 +305,48 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, } } +static void nft_exthdr_sctp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + + if (pkt->tprot != IPPROTO_SCTP) + goto err; + + do { + sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); + if (!sch || !sch->length) + break; + + if (sch->type == priv->type) { + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + nft_reg_store8(dest, true); + return; + } + if (priv->offset + priv->len > ntohs(sch->length) || + offset + ntohs(sch->length) > pkt->skb->len) + break; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) + break; + return; + } + offset += SCTP_PAD4(ntohs(sch->length)); + } while (offset < pkt->skb->len); +err: + if (priv->flags & NFT_EXTHDR_F_PRESENT) + nft_reg_store8(dest, false); + else + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -499,6 +546,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .dump = nft_exthdr_dump_set, }; +static const struct nft_expr_ops nft_exthdr_sctp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_sctp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -529,6 +584,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_ipv4_ops; } break; + case NFT_EXTHDR_OP_SCTP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_sctp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 4843dd2b410c..0af34ad41479 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -291,7 +291,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst)) goto out; diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c new file mode 100644 index 000000000000..304e33cbed9b --- /dev/null +++ b/net/netfilter/nft_last.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_last_priv { + unsigned long last_jiffies; + unsigned int last_set; +}; + +static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = { + [NFTA_LAST_SET] = { .type = NLA_U32 }, + [NFTA_LAST_MSECS] = { .type = NLA_U64 }, +}; + +static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + u64 last_jiffies; + u32 last_set = 0; + int err; + + if (tb[NFTA_LAST_SET]) { + last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET])); + if (last_set == 1) + priv->last_set = 1; + } + + if (last_set && tb[NFTA_LAST_MSECS]) { + err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies); + if (err < 0) + return err; + + priv->last_jiffies = jiffies - (unsigned long)last_jiffies; + } + + return 0; +} + +static void nft_last_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + + if (READ_ONCE(priv->last_jiffies) != jiffies) + WRITE_ONCE(priv->last_jiffies, jiffies); + if (READ_ONCE(priv->last_set) == 0) + WRITE_ONCE(priv->last_set, 1); +} + +static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_last_priv *priv = nft_expr_priv(expr); + unsigned long last_jiffies = READ_ONCE(priv->last_jiffies); + u32 last_set = READ_ONCE(priv->last_set); + __be64 msecs; + + if (time_before(jiffies, last_jiffies)) { + WRITE_ONCE(priv->last_set, 0); + last_set = 0; + } + + if (last_set) + msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies); + else + msecs = 0; + + if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) || + nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_last_ops = { + .type = &nft_last_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)), + .eval = nft_last_eval, + .init = nft_last_init, + .dump = nft_last_dump, +}; + +struct nft_expr_type nft_last_type __read_mostly = { + .name = "last", + .ops = &nft_last_ops, + .policy = nft_last_policy, + .maxattr = NFTA_LAST_MAX, + .flags = NFT_EXPR_STATEFUL, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a479f8a1270c..90becbf5bff3 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -23,6 +23,37 @@ struct nft_lookup { struct nft_set_binding binding; }; +#ifdef CONFIG_RETPOLINE +bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + if (set->ops == &nft_set_hash_fast_type.ops) + return nft_hash_lookup_fast(net, set, key, ext); + if (set->ops == &nft_set_hash_type.ops) + return nft_hash_lookup(net, set, key, ext); + + if (set->ops == &nft_set_rhash_type.ops) + return nft_rhash_lookup(net, set, key, ext); + + if (set->ops == &nft_set_bitmap_type.ops) + return nft_bitmap_lookup(net, set, key, ext); + + if (set->ops == &nft_set_pipapo_type.ops) + return nft_pipapo_lookup(net, set, key, ext); +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (set->ops == &nft_set_pipapo_avx2_type.ops) + return nft_pipapo_avx2_lookup(net, set, key, ext); +#endif + + if (set->ops == &nft_set_rbtree_type.ops) + return nft_rbtree_lookup(net, set, key, ext); + + WARN_ON_ONCE(1); + return set->ops->lookup(net, set, key, ext); +} +EXPORT_SYMBOL_GPL(nft_set_do_lookup); +#endif + void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -33,8 +64,8 @@ void nft_lookup_eval(const struct nft_expr *expr, const struct net *net = nft_net(pkt); bool found; - found = set->ops->lookup(net, set, ®s->data[priv->sreg], &ext) ^ - priv->invert; + found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext) ^ + priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 0840c635b752..be1595d6979d 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -201,7 +201,9 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: - return -EAFNOSUPPORT; + if (tb[NFTA_NAT_REG_ADDR_MIN]) + return -EAFNOSUPPORT; + break; } priv->family = family; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7e47edee88ee..94b2327e71dc 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -9,7 +9,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) @@ -110,7 +110,7 @@ static void nft_objref_map_eval(const struct nft_expr *expr, struct nft_object *obj; bool found; - found = set->ops->lookup(net, set, ®s->data[priv->sreg], &ext); + found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext); if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index ac61f708b82d..d82677e83400 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nf_osf_data data; struct tcphdr _tcph; + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) { diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 501c5b24cc39..a44b14f6c0dc 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -110,7 +110,7 @@ void nft_payload_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -507,7 +507,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, *l4csum_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: - if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + if (!nft_payload_udp_checksum(skb, nft_thoff(pkt))) return -1; fallthrough; case IPPROTO_UDPLITE: @@ -520,7 +520,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return -1; } - *l4csum_offset += pkt->xt.thoff; + *l4csum_offset += nft_thoff(pkt); return 0; } @@ -612,7 +612,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -643,7 +643,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && pkt->tprot == IPPROTO_SCTP && skb->ip_summed != CHECKSUM_PARTIAL) { - if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + if (nft_payload_csum_sctp(skb, nft_thoff(pkt))) goto err; } diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 95090186ee90..554caf967baa 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: @@ -45,7 +45,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 2a81ea421819..e7ae5914971e 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -73,8 +73,9 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) return (bitmap[idx] & (0x3 << off)) & (genmask << off); } -static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 7b3d0a78c569..df40314de21f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -74,8 +74,9 @@ static const struct rhashtable_params nft_rhash_params = { .automatic_shrinking = true, }; -static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -446,8 +447,9 @@ struct nft_hash_elem { struct nft_set_ext ext; }; -static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -484,9 +486,10 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, return ERR_PTR(-ENOENT); } -static bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_hash_lookup_fast(const struct net *net, + const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index d84afb8fa79a..25a75591583e 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -178,8 +178,6 @@ struct nft_pipapo_elem { int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, union nft_pipapo_map_bucket *mt, bool match_only); -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index eabdb8d552ee..e517663e0cd1 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -142,7 +142,6 @@ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) * @map: Bitmap to be scanned for set bits * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers - * @len: Length of bitmap in longs * @last: Return index of first set bit, if this is the last field * * This is an alternative implementation of pipapo_refill() suitable for usage @@ -1109,7 +1108,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. @@ -1136,8 +1135,13 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, m = rcu_dereference(priv->match); - /* This also protects access to all data related to scratch maps */ - kernel_fpu_begin(); + /* This also protects access to all data related to scratch maps. + * + * Note that we don't need a valid MXCSR state for any of the + * operations we use here, so pass 0 as mask and spare a LDMXCSR + * instruction. + */ + kernel_fpu_begin_mask(0); scratch = *raw_cpu_ptr(m->scratch_aligned); if (unlikely(!scratch)) { diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index 394bcb704db7..dbb6aaca8a7a 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,6 @@ #include <asm/fpu/xstate.h> #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9e36eb4a7429..d600a566da32 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -107,8 +107,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set return false; } -static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +INDIRECT_CALLABLE_SCOPE +bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 4fda8b3f1762..a0109fa1e92d 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; @@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, return; } - tcp = skb_header_pointer(skb, pkt->xt.thoff, + tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index accef672088c..b5b09a902c7a 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { + regs->verdict.code = NFT_BREAK; + return; + } + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (!hp) { regs->verdict.code = NFT_BREAK; @@ -82,16 +88,17 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); - struct in6_addr taddr; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; + struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; memset(&taddr, 0, sizeof(taddr)); - if (!pkt->tprot_set) { + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 84e58ee501a4..25524e393349 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -39,6 +39,20 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 #define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) +struct xt_template { + struct list_head list; + + /* called when table is needed in the given netns */ + int (*table_init)(struct net *net); + + struct module *me; + + /* A unique name... */ + char name[XT_TABLE_MAXNAMELEN]; +}; + +static struct list_head xt_templates[NFPROTO_NUMPROTO]; + struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; }; @@ -1221,48 +1235,43 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); - struct xt_table *t, *found = NULL; + struct module *owner = NULL; + struct xt_template *tmpl; + struct xt_table *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; - if (net == &init_net) - goto out; - - /* Table doesn't exist in this netns, re-try init */ - xt_net = net_generic(&init_net, xt_pernet_id); - list_for_each_entry(t, &xt_net->tables[af], list) { + /* Table doesn't exist in this netns, check larval list */ + list_for_each_entry(tmpl, &xt_templates[af], list) { int err; - if (strcmp(t->name, name)) + if (strcmp(tmpl->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(tmpl->me)) goto out; + + owner = tmpl->me; + mutex_unlock(&xt[af].mutex); - err = t->table_init(net); + err = tmpl->table_init(net); if (err < 0) { - module_put(t->me); + module_put(owner); return ERR_PTR(err); } - found = t; - mutex_lock(&xt[af].mutex); break; } - if (!found) - goto out; - - xt_net = net_generic(net, xt_pernet_id); /* and once again: */ list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0) return t; - module_put(found->me); + module_put(owner); out: mutex_unlock(&xt[af].mutex); return ERR_PTR(-ENOENT); @@ -1749,6 +1758,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) } EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); +int xt_register_template(const struct xt_table *table, + int (*table_init)(struct net *net)) +{ + int ret = -EEXIST, af = table->af; + struct xt_template *t; + + mutex_lock(&xt[af].mutex); + + list_for_each_entry(t, &xt_templates[af], list) { + if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0)) + goto out_unlock; + } + + ret = -ENOMEM; + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + goto out_unlock; + + BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name)); + + strscpy(t->name, table->name, sizeof(t->name)); + t->table_init = table_init; + t->me = table->me; + list_add(&t->list, &xt_templates[af]); + ret = 0; +out_unlock: + mutex_unlock(&xt[af].mutex); + return ret; +} +EXPORT_SYMBOL_GPL(xt_register_template); + +void xt_unregister_template(const struct xt_table *table) +{ + struct xt_template *t; + int af = table->af; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_templates[af], list) { + if (strcmp(table->name, t->name)) + continue; + + list_del(&t->list); + mutex_unlock(&xt[af].mutex); + kfree(t); + return; + } + + mutex_unlock(&xt[af].mutex); + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(xt_unregister_template); + int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS @@ -1937,6 +1998,7 @@ static int __init xt_init(void) #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); + INIT_LIST_HEAD(&xt_templates[i]); } rv = register_pernet_subsys(&xt_net_ops); if (rv < 0) diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 9cdc16b0d0d8..b6a015aee0ce 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -117,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par) const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { - pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n", + pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n", XT_AUDIT_TYPE_MAX); return -ERANGE; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index d4deee39158b..0a913ce07425 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -172,7 +172,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err2; } - ret = 0; if ((info->ct_events || info->exp_events) && !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, GFP_KERNEL)) { @@ -352,21 +351,10 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static int notrack_chk(const struct xt_tgchk_param *par) -{ - if (!par->net->xt.notrack_deprecated_warning) { - pr_info("netfilter: NOTRACK target is deprecated, " - "use CT instead or upgrade iptables\n"); - par->net->xt.notrack_deprecated_warning = true; - } - return 0; -} - static struct xt_target notrack_tg_reg __read_mostly = { .name = "NOTRACK", .revision = 0, .family = NFPROTO_UNSPEC, - .checkentry = notrack_chk, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 13cf3f9b5938..849ac552a154 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return BPF_PROG_RUN(info->filter, skb); + return bpf_prog_run(info->filter, skb); } static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 24d4afb9988d..8b4fd27857f2 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -8,16 +8,14 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_limit.h> struct xt_limit_priv { - spinlock_t lock; unsigned long prev; - uint32_t credit; + u32 credit; }; MODULE_LICENSE("GPL"); @@ -66,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateinfo *r = par->matchinfo; struct xt_limit_priv *priv = r->master; - unsigned long now = jiffies; - - spin_lock_bh(&priv->lock); - priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; - if (priv->credit > r->credit_cap) - priv->credit = r->credit_cap; - - if (priv->credit >= r->cost) { - /* We're not limited. */ - priv->credit -= r->cost; - spin_unlock_bh(&priv->lock); - return true; - } - - spin_unlock_bh(&priv->lock); - return false; + unsigned long now; + u32 old_credit, new_credit, credit_increase = 0; + bool ret; + + /* fastpath if there is nothing to update */ + if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies)) + return false; + + do { + now = jiffies; + credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + old_credit = READ_ONCE(priv->credit); + new_credit = old_credit; + new_credit += credit_increase; + if (new_credit > r->credit_cap) + new_credit = r->credit_cap; + if (new_credit >= r->cost) { + ret = true; + new_credit -= r->cost; + } else { + ret = false; + } + } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit); + + return ret; } /* Precision saver. */ @@ -122,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par) r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } - spin_lock_init(&priv->lock); return 0; } diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index f28c8947c730..91a19c3ea1a3 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -105,7 +105,7 @@ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: ret_val = netlbl_calipso_add_pass(info, &audit_info); @@ -287,7 +287,7 @@ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CALIPSO_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 4f50a64315cf..894e6b8f1a86 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -144,8 +144,8 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, return -ENOMEM; doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); if (doi_def->map.std == NULL) { - ret_val = -ENOMEM; - goto add_std_failure; + kfree(doi_def); + return -ENOMEM; } doi_def->type = CIPSO_V4_MAP_TRANS; @@ -187,14 +187,14 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, } doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -263,7 +263,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.local = kcalloc( doi_def->map.std->cat.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -271,7 +271,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.cipso = kcalloc( doi_def->map.std->cat.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -410,7 +410,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) !info->attrs[NLBL_CIPSOV4_A_MTYPE]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { case CIPSO_V4_MAP_TRANS: ret_val = netlbl_cipsov4_add_std(info, &audit_info); @@ -709,7 +709,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index dc8c39f51f7d..8158a25972b4 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -929,7 +929,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, * @cb_arg: argument for the callback function * * Description: - * Interate over the domain mapping hash table, skipping the first @skip_bkt + * Iterate over the domain mapping hash table, skipping the first @skip_bkt * buckets and @skip_chain entries. For each entry in the table call * @callback, if @callback returns a negative value stop 'walking' through the * table and return. Updates the values in @skip_bkt and @skip_chain on diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 5e1239cef000..beb0e573266d 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -719,7 +719,7 @@ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) * it in @bitmap. The @offset must be aligned to an unsigned long and will be * updated on return if different from what was requested; if the catmap is * empty at the requested offset and beyond, the @offset is set to (u32)-1. - * Returns zero on sucess, negative values on failure. + * Returns zero on success, negative values on failure. * */ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index ca52f5085989..032b7d7b32c7 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -76,6 +76,7 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { static int netlbl_mgmt_add_common(struct genl_info *info, struct netlbl_audit *audit_info) { + void *pmap = NULL; int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; @@ -175,6 +176,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = addr->s_addr & mask->s_addr; map->list.mask = mask->s_addr; map->list.valid = 1; @@ -183,10 +185,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.cipso = cipsov4; ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->family = AF_INET; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; @@ -223,6 +223,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_addrmap; } + pmap = map; map->list.addr = *addr; map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; @@ -235,10 +236,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->def.calipso = calipso; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); - if (ret_val != 0) { - kfree(map); - goto add_free_addrmap; - } + if (ret_val != 0) + goto add_free_map; entry->family = AF_INET6; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; @@ -248,10 +247,12 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) - goto add_free_addrmap; + goto add_free_map; return 0; +add_free_map: + kfree(pmap); add_free_addrmap: kfree(addrmap); add_doi_put_def: @@ -434,7 +435,7 @@ static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -457,7 +458,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NLBL_MGMT_A_DOMAIN]) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); @@ -557,7 +558,7 @@ static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_mgmt_add_common(info, &audit_info); } @@ -576,7 +577,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) { struct netlbl_audit audit_info; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 3e6ac9b790b1..566ba4397ee4 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -492,8 +492,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, netlbl_af4list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr->s_addr, mask->s_addr); - if (dev != NULL) - dev_put(dev); + dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { @@ -553,8 +552,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, netlbl_af6list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr, mask); - if (dev != NULL) - dev_put(dev); + dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { @@ -814,7 +812,7 @@ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } @@ -897,7 +895,7 @@ static int netlbl_unlabel_staticadd(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -947,7 +945,7 @@ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -994,7 +992,7 @@ static int netlbl_unlabel_staticremove(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) @@ -1034,7 +1032,7 @@ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; - netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index b9ba8112b3c5..6190cbf94bf0 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -28,11 +28,9 @@ /** * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg - * @skb: the packet * @audit_info: NetLabel audit information */ -static inline void netlbl_netlink_auditinfo(struct sk_buff *skb, - struct netlbl_audit *audit_info) +static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { security_task_getsecid_subj(current, &audit_info->secid); audit_info->loginuid = audit_get_loginuid(current); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6133e412b948..24b7cf447bc5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -351,7 +351,7 @@ static void netlink_overrun(struct sock *sk) if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); + sk_error_report(sk); } } atomic_inc(&sk->sk_drops); @@ -1576,7 +1576,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) } sk->sk_err = p->code; - sk->sk_error_report(sk); + sk_error_report(sk); out: return ret; } @@ -2012,7 +2012,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ret = netlink_dump(sk); if (ret) { sk->sk_err = -ret; - sk->sk_error_report(sk); + sk_error_report(sk); } } @@ -2439,7 +2439,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; - NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk); + sk_error_report(NETLINK_CB(in_skb).sk); return; } @@ -2471,7 +2471,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, nlmsg_end(skb, rep); - netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); + nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); } EXPORT_SYMBOL(netlink_ack); @@ -2545,13 +2545,15 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); + if (err == -ESRCH) + err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); - if (!err || err == -ESRCH) + if (!err) err = err2; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2d6fdf40df66..1afca2a6c2ac 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -40,14 +40,6 @@ void genl_unlock(void) } EXPORT_SYMBOL(genl_unlock); -#ifdef CONFIG_LOCKDEP -bool lockdep_genl_is_held(void) -{ - return lockdep_is_held(&genl_mutex); -} -EXPORT_SYMBOL(lockdep_genl_is_held); -#endif - static void genl_lock_all(void) { down_write(&cb_lock); @@ -1485,6 +1477,7 @@ int genlmsg_multicast_allns(const struct genl_family *family, { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; + group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group, flags); } @@ -1495,14 +1488,12 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - int report = 0; - - if (info->nlhdr) - report = nlmsg_report(info->nlhdr); if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; + group = family->mcgrp_offset + group; - nlmsg_notify(sk, skb, info->snd_portid, group, report, flags); + nlmsg_notify(sk, skb, info->snd_portid, group, + nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify); diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index a880dd33e901..511819fbfa67 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -59,8 +59,7 @@ static void nr_loopback_timer(struct timer_list *unused) if (dev == NULL || nr_rx_frame(skb, dev) == 0) kfree_skb(skb); - if (dev != NULL) - dev_put(dev); + dev_put(dev); if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running()) mod_timer(&loopback_timer, jiffies + 10); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 78da5eab252a..ddd5cbd455e3 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -266,6 +266,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, fallthrough; case 2: re_sort_routes(nr_node, 0, 1); + break; case 1: break; } @@ -359,6 +360,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n fallthrough; case 1: nr_node->routes[1] = nr_node->routes[2]; + fallthrough; case 2: break; } @@ -482,6 +484,7 @@ static int nr_dec_obs(void) fallthrough; case 1: s->routes[1] = s->routes[2]; + break; case 2: break; } @@ -529,6 +532,7 @@ void nr_rt_device_down(struct net_device *dev) fallthrough; case 1: t->routes[1] = t->routes[2]; + break; case 2: break; } @@ -578,8 +582,7 @@ struct net_device *nr_dev_first(void) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } - if (first) - dev_hold(first); + dev_hold(first); rcu_read_unlock(); return first; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 9115f8a7dd45..a8da88db7893 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -121,11 +121,9 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { - sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); - sock_put(sk); - return; + goto out; } break; @@ -146,6 +144,8 @@ static void nr_heartbeat_expiry(struct timer_list *t) nr_start_heartbeat(sk); bh_unlock_sock(sk); +out: + sock_put(sk); } static void nr_t2timer_expiry(struct timer_list *t) @@ -159,6 +159,7 @@ static void nr_t2timer_expiry(struct timer_list *t) nr_enquiry_response(sk); } bh_unlock_sock(sk); + sock_put(sk); } static void nr_t4timer_expiry(struct timer_list *t) @@ -169,6 +170,7 @@ static void nr_t4timer_expiry(struct timer_list *t) bh_lock_sock(sk); nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY; bh_unlock_sock(sk); + sock_put(sk); } static void nr_idletimer_expiry(struct timer_list *t) @@ -197,6 +199,7 @@ static void nr_idletimer_expiry(struct timer_list *t) sock_set_flag(sk, SOCK_DEAD); } bh_unlock_sock(sk); + sock_put(sk); } static void nr_t1timer_expiry(struct timer_list *t) @@ -209,8 +212,7 @@ static void nr_t1timer_expiry(struct timer_list *t) case NR_STATE_1: if (nr->n2count == nr->n2) { nr_disconnect(sk, ETIMEDOUT); - bh_unlock_sock(sk); - return; + goto out; } else { nr->n2count++; nr_write_internal(sk, NR_CONNREQ); @@ -220,8 +222,7 @@ static void nr_t1timer_expiry(struct timer_list *t) case NR_STATE_2: if (nr->n2count == nr->n2) { nr_disconnect(sk, ETIMEDOUT); - bh_unlock_sock(sk); - return; + goto out; } else { nr->n2count++; nr_write_internal(sk, NR_DISCREQ); @@ -231,8 +232,7 @@ static void nr_t1timer_expiry(struct timer_list *t) case NR_STATE_3: if (nr->n2count == nr->n2) { nr_disconnect(sk, ETIMEDOUT); - bh_unlock_sock(sk); - return; + goto out; } else { nr->n2count++; nr_requeue_frames(sk); @@ -241,5 +241,7 @@ static void nr_t1timer_expiry(struct timer_list *t) } nr_start_t1timer(sk); +out: bh_unlock_sock(sk); + sock_put(sk); } diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 4a9e72073564..6024fad905ff 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -79,7 +79,7 @@ int __init af_nfc_init(void) return sock_register(&nfc_sock_family_ops); } -void af_nfc_exit(void) +void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); } diff --git a/net/nfc/core.c b/net/nfc/core.c index 573c80c6ff7a..3c645c1d99c9 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -636,7 +636,7 @@ error: return rc; } -int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); @@ -665,7 +665,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, - u8 *gb, size_t gb_len) + const u8 *gb, size_t gb_len) { int rc; @@ -824,7 +824,7 @@ EXPORT_SYMBOL(nfc_targets_found); */ int nfc_target_lost(struct nfc_dev *dev, u32 target_idx) { - struct nfc_target *tg; + const struct nfc_target *tg; int i; pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx); @@ -1048,7 +1048,7 @@ struct nfc_dev *nfc_get_device(unsigned int idx) * @tx_headroom: reserved space at beginning of skb * @tx_tailroom: reserved space at end of skb */ -struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, +struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom) { diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 5044c7db577e..fefc03674f4f 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -732,7 +732,7 @@ exit: return rc; } -static struct nfc_ops digital_nfc_ops = { +static const struct nfc_ops digital_nfc_ops = { .dev_up = digital_dev_up, .dev_down = digital_dev_down, .start_poll = digital_start_poll, @@ -745,7 +745,7 @@ static struct nfc_ops digital_nfc_ops = { .im_transceive = digital_in_send, }; -struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, +struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops *ops, __u32 supported_protocols, __u32 driver_capabilities, int tx_headroom, int tx_tailroom) diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index e02b9befce0b..3a89bd9b89fc 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -34,7 +34,7 @@ static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, * HCI command execution completion callback. * err will be a standard linux error (may be converted from HCI response) * skb contains the response data and must be disposed, or may be NULL if - * an error occured + * an error occurred */ static void nfc_hci_execute_cb(void *context, struct sk_buff *skb, int err) { diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 43811b5219b5..ceb87db57cdb 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -128,7 +128,7 @@ static void nfc_hci_msg_rx_work(struct work_struct *work) struct nfc_hci_dev *hdev = container_of(work, struct nfc_hci_dev, msg_rx_work); struct sk_buff *skb; - struct hcp_message *message; + const struct hcp_message *message; u8 pipe; u8 type; u8 instruction; @@ -182,9 +182,9 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 status = NFC_HCI_ANY_OK; - struct hci_create_pipe_resp *create_info; - struct hci_delete_pipe_noti *delete_info; - struct hci_all_pipe_cleared_noti *cleared_info; + const struct hci_create_pipe_resp *create_info; + const struct hci_delete_pipe_noti *delete_info; + const struct hci_all_pipe_cleared_noti *cleared_info; u8 gate; pr_debug("from pipe %x cmd %x\n", pipe, cmd); @@ -447,7 +447,7 @@ static void nfc_hci_cmd_timeout(struct timer_list *t) } static int hci_dev_connect_gates(struct nfc_hci_dev *hdev, u8 gate_count, - struct nfc_hci_gate *gates) + const struct nfc_hci_gate *gates) { int r; while (gate_count--) { @@ -705,7 +705,7 @@ static void hci_transceive_cb(void *context, struct sk_buff *skb, int err) /* * TODO: Check RF Error indicator to make sure data is valid. * It seems that HCI cmd can complete without error, but data - * can be invalid if an RF error occured? Ignore for now. + * can be invalid if an RF error occurred? Ignore for now. */ if (err == 0) skb_trim(skb, skb->len - 1); /* RF Err ind */ @@ -928,7 +928,7 @@ static int hci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) return hdev->ops->fw_download(hdev, firmware_name); } -static struct nfc_ops hci_nfc_ops = { +static const struct nfc_ops hci_nfc_ops = { .dev_up = hci_dev_up, .dev_down = hci_dev_down, .start_poll = hci_start_poll, @@ -947,7 +947,7 @@ static struct nfc_ops hci_nfc_ops = { .se_io = hci_se_io, }; -struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, +struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops, struct nfc_hci_init_data *init_data, unsigned long quirks, u32 protocols, diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 6ab40ea17662..2140f6724644 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -11,7 +11,7 @@ static LIST_HEAD(llc_engines); -int nfc_llc_init(void) +int __init nfc_llc_init(void) { int r; @@ -41,7 +41,7 @@ void nfc_llc_exit(void) } } -int nfc_llc_register(const char *name, struct nfc_llc_ops *ops) +int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) { struct nfc_llc_engine *llc_engine; diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index 823ddb621e5d..d66271d211a5 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -26,20 +26,20 @@ struct nfc_llc_ops { struct nfc_llc_engine { const char *name; - struct nfc_llc_ops *ops; + const struct nfc_llc_ops *ops; struct list_head entry; }; struct nfc_llc { void *data; - struct nfc_llc_ops *ops; + const struct nfc_llc_ops *ops; int rx_headroom; int rx_tailroom; }; void *nfc_llc_get_data(struct nfc_llc *llc); -int nfc_llc_register(const char *name, struct nfc_llc_ops *ops); +int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops); void nfc_llc_unregister(const char *name); int nfc_llc_nop_register(void); diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c index a42852f36f2e..a58716f16954 100644 --- a/net/nfc/hci/llc_nop.c +++ b/net/nfc/hci/llc_nop.c @@ -71,7 +71,7 @@ static int llc_nop_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) return llc_nop->xmit_to_drv(llc_nop->hdev, skb); } -static struct nfc_llc_ops llc_nop_ops = { +static const struct nfc_llc_ops llc_nop_ops = { .init = llc_nop_init, .deinit = llc_nop_deinit, .start = llc_nop_start, diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index c0c8fea3a186..aef750d7787c 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -123,7 +123,7 @@ static bool llc_shdlc_x_lteq_y_lt_z(int x, int y, int z) return ((y >= x) || (y < z)) ? true : false; } -static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, +static struct sk_buff *llc_shdlc_alloc_skb(const struct llc_shdlc *shdlc, int payload_len) { struct sk_buff *skb; @@ -137,7 +137,7 @@ static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, } /* immediately sends an S frame. */ -static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_s_frame(const struct llc_shdlc *shdlc, enum sframe_type sframe_type, int nr) { int r; @@ -159,7 +159,7 @@ static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, } /* immediately sends an U frame. skb may contain optional payload */ -static int llc_shdlc_send_u_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_u_frame(const struct llc_shdlc *shdlc, struct sk_buff *skb, enum uframe_modifier uframe_modifier) { @@ -361,7 +361,7 @@ static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r) wake_up(shdlc->connect_wq); } -static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_initiate(const struct llc_shdlc *shdlc) { struct sk_buff *skb; @@ -377,7 +377,7 @@ static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_RSET); } -static int llc_shdlc_connect_send_ua(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_send_ua(const struct llc_shdlc *shdlc) { struct sk_buff *skb; @@ -406,7 +406,7 @@ static void llc_shdlc_rcv_u_frame(struct llc_shdlc *shdlc, case SHDLC_NEGOTIATING: case SHDLC_CONNECTING: /* - * We sent RSET, but chip wants to negociate or we + * We sent RSET, but chip wants to negotiate or we * got RSET before we managed to send out our. */ if (skb->len > 0) @@ -820,7 +820,7 @@ static int llc_shdlc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) return 0; } -static struct nfc_llc_ops llc_shdlc_ops = { +static const struct nfc_llc_ops llc_shdlc_ops = { .init = llc_shdlc_init, .deinit = llc_shdlc_deinit, .start = llc_shdlc_start, diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index 97853c9cefc7..d49d4bf2e37c 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -221,15 +221,15 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *sk, struct socket *newsock); /* TLV API */ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); /* Commands API */ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err); -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length); +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length); struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap); -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len); void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 475061c79c44..3c4172a5aeb5 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -15,7 +15,7 @@ #include "nfc.h" #include "llcp.h" -static u8 llcp_tlv_length[LLCP_TLV_MAX] = { +static const u8 llcp_tlv_length[LLCP_TLV_MAX] = { 0, 1, /* VERSION */ 2, /* MIUX */ @@ -29,7 +29,7 @@ static u8 llcp_tlv_length[LLCP_TLV_MAX] = { }; -static u8 llcp_tlv8(u8 *tlv, u8 type) +static u8 llcp_tlv8(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -37,7 +37,7 @@ static u8 llcp_tlv8(u8 *tlv, u8 type) return tlv[2]; } -static u16 llcp_tlv16(u8 *tlv, u8 type) +static u16 llcp_tlv16(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -46,37 +46,37 @@ static u16 llcp_tlv16(u8 *tlv, u8 type) } -static u8 llcp_tlv_version(u8 *tlv) +static u8 llcp_tlv_version(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_VERSION); } -static u16 llcp_tlv_miux(u8 *tlv) +static u16 llcp_tlv_miux(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff; } -static u16 llcp_tlv_wks(u8 *tlv) +static u16 llcp_tlv_wks(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_WKS); } -static u16 llcp_tlv_lto(u8 *tlv) +static u16 llcp_tlv_lto(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_LTO); } -static u8 llcp_tlv_opt(u8 *tlv) +static u8 llcp_tlv_opt(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_OPT); } -static u8 llcp_tlv_rw(u8 *tlv) +static u8 llcp_tlv_rw(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf; } -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length) +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length) { u8 *tlv, length; @@ -130,7 +130,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) return sdres; } -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len) { struct nfc_llcp_sdp_tlv *sdreq; @@ -190,9 +190,10 @@ void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) } int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -239,9 +240,10 @@ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, } int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -295,7 +297,7 @@ static struct sk_buff *llcp_add_header(struct sk_buff *pdu, return pdu; } -static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, u8 *tlv, +static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv, u8 tlv_length) { /* XXX Add an skb length check */ @@ -389,9 +391,10 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *service_name_tlv = NULL, service_name_tlv_length; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *service_name_tlv = NULL; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 service_name_tlv_length, miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; @@ -465,8 +468,9 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index cc997518f79d..eaeb2b1cfa6a 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -301,7 +301,7 @@ static char *wks[] = { "urn:nfc:sn:snep", }; -static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) +static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; @@ -325,7 +325,7 @@ static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; @@ -522,7 +522,7 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; - u8 *version_tlv = NULL, *lto_tlv = NULL, + const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; @@ -612,7 +612,7 @@ u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) return local->gb; } -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; @@ -639,27 +639,27 @@ int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) local->remote_gb_len - 3); } -static u8 nfc_llcp_dsap(struct sk_buff *pdu) +static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } -static u8 nfc_llcp_ptype(struct sk_buff *pdu) +static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } -static u8 nfc_llcp_ssap(struct sk_buff *pdu) +static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } -static u8 nfc_llcp_ns(struct sk_buff *pdu) +static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } -static u8 nfc_llcp_nr(struct sk_buff *pdu) +static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } @@ -801,7 +801,7 @@ out: } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { struct nfc_llcp_sock *llcp_sock; @@ -815,9 +815,10 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, return llcp_sock; } -static u8 *nfc_llcp_connect_sn(struct sk_buff *skb, size_t *sn_len) +static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { - u8 *tlv = &skb->data[2], type, length; + u8 type, length; + const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { @@ -875,7 +876,7 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; @@ -893,7 +894,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, goto fail; } } else { - u8 *sn; + const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); @@ -1112,7 +1113,7 @@ static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1155,7 +1156,8 @@ static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1188,7 +1190,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1226,12 +1229,13 @@ static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; - u8 dsap, ssap, *tlv, type, length, tid, sap; + u8 dsap, ssap, type, length, tid, sap; + const u8 *tlv; u16 tlv_len, offset; - char *service_name; + const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index da7fe9db1b00..82ab39d80726 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -53,9 +53,9 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, } int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, - struct dest_spec_params *params) + const struct dest_spec_params *params) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { if (conn_info->dest_type == dest_type) { @@ -95,8 +95,8 @@ static void nci_req_cancel(struct nci_dev *ndev, int err) /* Execute request and wait for completion. */ static int __nci_request(struct nci_dev *ndev, - void (*req)(struct nci_dev *ndev, unsigned long opt), - unsigned long opt, __u32 timeout) + void (*req)(struct nci_dev *ndev, const void *opt), + const void *opt, __u32 timeout) { int rc = 0; long completion_rc; @@ -139,8 +139,8 @@ static int __nci_request(struct nci_dev *ndev, inline int nci_request(struct nci_dev *ndev, void (*req)(struct nci_dev *ndev, - unsigned long opt), - unsigned long opt, __u32 timeout) + const void *opt), + const void *opt, __u32 timeout) { int rc; @@ -155,7 +155,7 @@ inline int nci_request(struct nci_dev *ndev, return rc; } -static void nci_reset_req(struct nci_dev *ndev, unsigned long opt) +static void nci_reset_req(struct nci_dev *ndev, const void *opt) { struct nci_core_reset_cmd cmd; @@ -163,17 +163,17 @@ static void nci_reset_req(struct nci_dev *ndev, unsigned long opt) nci_send_cmd(ndev, NCI_OP_CORE_RESET_CMD, 1, &cmd); } -static void nci_init_req(struct nci_dev *ndev, unsigned long opt) +static void nci_init_req(struct nci_dev *ndev, const void *opt) { u8 plen = 0; if (opt) plen = sizeof(struct nci_core_init_v2_cmd); - nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, (void *)opt); + nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, opt); } -static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt) +static void nci_init_complete_req(struct nci_dev *ndev, const void *opt) { struct nci_rf_disc_map_cmd cmd; struct disc_map_config *cfg = cmd.mapping_configs; @@ -210,14 +210,14 @@ static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt) } struct nci_set_config_param { - __u8 id; - size_t len; - __u8 *val; + __u8 id; + size_t len; + const __u8 *val; }; -static void nci_set_config_req(struct nci_dev *ndev, unsigned long opt) +static void nci_set_config_req(struct nci_dev *ndev, const void *opt) { - struct nci_set_config_param *param = (struct nci_set_config_param *)opt; + const struct nci_set_config_param *param = opt; struct nci_core_set_config_cmd cmd; BUG_ON(param->len > NCI_MAX_PARAM_LEN); @@ -235,10 +235,9 @@ struct nci_rf_discover_param { __u32 tm_protocols; }; -static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt) +static void nci_rf_discover_req(struct nci_dev *ndev, const void *opt) { - struct nci_rf_discover_param *param = - (struct nci_rf_discover_param *)opt; + const struct nci_rf_discover_param *param = opt; struct nci_rf_disc_cmd cmd; cmd.num_disc_configs = 0; @@ -301,10 +300,9 @@ struct nci_rf_discover_select_param { __u8 rf_protocol; }; -static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt) +static void nci_rf_discover_select_req(struct nci_dev *ndev, const void *opt) { - struct nci_rf_discover_select_param *param = - (struct nci_rf_discover_select_param *)opt; + const struct nci_rf_discover_select_param *param = opt; struct nci_rf_discover_select_cmd cmd; cmd.rf_discovery_id = param->rf_discovery_id; @@ -328,11 +326,11 @@ static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt) sizeof(struct nci_rf_discover_select_cmd), &cmd); } -static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) +static void nci_rf_deactivate_req(struct nci_dev *ndev, const void *opt) { struct nci_rf_deactivate_cmd cmd; - cmd.type = opt; + cmd.type = (unsigned long)opt; nci_send_cmd(ndev, NCI_OP_RF_DEACTIVATE_CMD, sizeof(struct nci_rf_deactivate_cmd), &cmd); @@ -341,18 +339,17 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) struct nci_cmd_param { __u16 opcode; size_t len; - __u8 *payload; + const __u8 *payload; }; -static void nci_generic_req(struct nci_dev *ndev, unsigned long opt) +static void nci_generic_req(struct nci_dev *ndev, const void *opt) { - struct nci_cmd_param *param = - (struct nci_cmd_param *)opt; + const struct nci_cmd_param *param = opt; nci_send_cmd(ndev, param->opcode, param->len, param->payload); } -int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, const __u8 *payload) { struct nci_cmd_param param; @@ -360,12 +357,13 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) param.len = len; param.payload = payload; - return __nci_request(ndev, nci_generic_req, (unsigned long)¶m, + return __nci_request(ndev, nci_generic_req, ¶m, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_prop_cmd); -int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload) +int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, + const __u8 *payload) { struct nci_cmd_param param; @@ -373,21 +371,21 @@ int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload) param.len = len; param.payload = payload; - return __nci_request(ndev, nci_generic_req, (unsigned long)¶m, + return __nci_request(ndev, nci_generic_req, ¶m, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_cmd); int nci_core_reset(struct nci_dev *ndev) { - return __nci_request(ndev, nci_reset_req, 0, + return __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); } EXPORT_SYMBOL(nci_core_reset); int nci_core_init(struct nci_dev *ndev) { - return __nci_request(ndev, nci_init_req, 0, + return __nci_request(ndev, nci_init_req, (void *)0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } EXPORT_SYMBOL(nci_core_init); @@ -397,9 +395,9 @@ struct nci_loopback_data { struct sk_buff *data; }; -static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) +static void nci_send_data_req(struct nci_dev *ndev, const void *opt) { - struct nci_loopback_data *data = (struct nci_loopback_data *)opt; + const struct nci_loopback_data *data = opt; nci_send_data(ndev, data->conn_id, data->data); } @@ -407,7 +405,7 @@ static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); if (!conn_info) { @@ -420,7 +418,7 @@ static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) nci_req_complete(ndev, NCI_STATUS_OK); } -int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, +int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len, struct sk_buff **resp) { int r; @@ -460,7 +458,7 @@ int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, loopback_data.data = skb; ndev->cur_conn_id = conn_id; - r = nci_request(ndev, nci_send_data_req, (unsigned long)&loopback_data, + r = nci_request(ndev, nci_send_data_req, &loopback_data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK && resp) *resp = conn_info->rx_skb; @@ -493,7 +491,7 @@ static int nci_open_device(struct nci_dev *ndev) rc = ndev->ops->init(ndev); if (!rc) { - rc = __nci_request(ndev, nci_reset_req, 0, + rc = __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); } @@ -506,10 +504,10 @@ static int nci_open_device(struct nci_dev *ndev) .feature1 = NCI_FEATURE_DISABLE, .feature2 = NCI_FEATURE_DISABLE }; - unsigned long opt = 0; + const void *opt = NULL; if (ndev->nci_ver & NCI_VER_2_MASK) - opt = (unsigned long)&nci_init_v2_cmd; + opt = &nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, msecs_to_jiffies(NCI_INIT_TIMEOUT)); @@ -519,7 +517,7 @@ static int nci_open_device(struct nci_dev *ndev) rc = ndev->ops->post_setup(ndev); if (!rc) { - rc = __nci_request(ndev, nci_init_complete_req, 0, + rc = __nci_request(ndev, nci_init_complete_req, (void *)0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } @@ -569,7 +567,7 @@ static int nci_close_device(struct nci_dev *ndev) atomic_set(&ndev->cmd_cnt, 1); set_bit(NCI_INIT, &ndev->flags); - __nci_request(ndev, nci_reset_req, 0, + __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); /* After this point our queues are empty @@ -624,7 +622,7 @@ static int nci_dev_down(struct nfc_dev *nfc_dev) return nci_close_device(ndev); } -int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val) +int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val) { struct nci_set_config_param param; @@ -635,15 +633,15 @@ int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val) param.len = len; param.val = val; - return __nci_request(ndev, nci_set_config_req, (unsigned long)¶m, + return __nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); } EXPORT_SYMBOL(nci_set_config); -static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt) +static void nci_nfcee_discover_req(struct nci_dev *ndev, const void *opt) { struct nci_nfcee_discover_cmd cmd; - __u8 action = opt; + __u8 action = (unsigned long)opt; cmd.discovery_action = action; @@ -652,15 +650,16 @@ static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt) int nci_nfcee_discover(struct nci_dev *ndev, u8 action) { - return __nci_request(ndev, nci_nfcee_discover_req, action, + unsigned long opt = action; + + return __nci_request(ndev, nci_nfcee_discover_req, (void *)opt, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_discover); -static void nci_nfcee_mode_set_req(struct nci_dev *ndev, unsigned long opt) +static void nci_nfcee_mode_set_req(struct nci_dev *ndev, const void *opt) { - struct nci_nfcee_mode_set_cmd *cmd = - (struct nci_nfcee_mode_set_cmd *)opt; + const struct nci_nfcee_mode_set_cmd *cmd = opt; nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD, sizeof(struct nci_nfcee_mode_set_cmd), cmd); @@ -673,16 +672,14 @@ int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode) cmd.nfcee_id = nfcee_id; cmd.nfcee_mode = nfcee_mode; - return __nci_request(ndev, nci_nfcee_mode_set_req, - (unsigned long)&cmd, + return __nci_request(ndev, nci_nfcee_mode_set_req, &cmd, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_mode_set); -static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt) +static void nci_core_conn_create_req(struct nci_dev *ndev, const void *opt) { - struct core_conn_create_data *data = - (struct core_conn_create_data *)opt; + const struct core_conn_create_data *data = opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd); } @@ -690,7 +687,7 @@ static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt) int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, u8 number_destination_params, size_t params_len, - struct core_conn_create_dest_spec_params *params) + const struct core_conn_create_dest_spec_params *params) { int r; struct nci_core_conn_create_cmd *cmd; @@ -719,24 +716,26 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, } ndev->cur_dest_type = destination_type; - r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data, + r = __nci_request(ndev, nci_core_conn_create_req, &data, msecs_to_jiffies(NCI_CMD_TIMEOUT)); kfree(cmd); return r; } EXPORT_SYMBOL(nci_core_conn_create); -static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt) +static void nci_core_conn_close_req(struct nci_dev *ndev, const void *opt) { - __u8 conn_id = opt; + __u8 conn_id = (unsigned long)opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CLOSE_CMD, 1, &conn_id); } int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) { + unsigned long opt = conn_id; + ndev->cur_conn_id = conn_id; - return __nci_request(ndev, nci_core_conn_close_req, conn_id, + return __nci_request(ndev, nci_core_conn_close_req, (void *)opt, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_conn_close); @@ -756,14 +755,14 @@ static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) param.id = NCI_PN_ATR_REQ_GEN_BYTES; - rc = nci_request(ndev, nci_set_config_req, (unsigned long)¶m, + rc = nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); if (rc) return rc; param.id = NCI_LN_ATR_RES_GEN_BYTES; - return nci_request(ndev, nci_set_config_req, (unsigned long)¶m, + return nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); } @@ -813,7 +812,7 @@ static int nci_start_poll(struct nfc_dev *nfc_dev, pr_debug("target active or w4 select, implicitly deactivate\n"); rc = nci_request(ndev, nci_rf_deactivate_req, - NCI_DEACTIVATE_TYPE_IDLE_MODE, + (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); if (rc) return -EBUSY; @@ -835,7 +834,7 @@ static int nci_start_poll(struct nfc_dev *nfc_dev, param.im_protocols = im_protocols; param.tm_protocols = tm_protocols; - rc = nci_request(ndev, nci_rf_discover_req, (unsigned long)¶m, + rc = nci_request(ndev, nci_rf_discover_req, ¶m, msecs_to_jiffies(NCI_RF_DISC_TIMEOUT)); if (!rc) @@ -854,7 +853,8 @@ static void nci_stop_poll(struct nfc_dev *nfc_dev) return; } - nci_request(ndev, nci_rf_deactivate_req, NCI_DEACTIVATE_TYPE_IDLE_MODE, + nci_request(ndev, nci_rf_deactivate_req, + (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } @@ -863,7 +863,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_rf_discover_select_param param; - struct nfc_target *nci_target = NULL; + const struct nfc_target *nci_target = NULL; int i; int rc = 0; @@ -913,8 +913,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, else param.rf_protocol = NCI_RF_PROTOCOL_NFC_DEP; - rc = nci_request(ndev, nci_rf_discover_select_req, - (unsigned long)¶m, + rc = nci_request(ndev, nci_rf_discover_select_req, ¶m, msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT)); } @@ -929,7 +928,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, __u8 mode) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); - u8 nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; + unsigned long nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; pr_debug("entry\n"); @@ -947,7 +946,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, } if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) { - nci_request(ndev, nci_rf_deactivate_req, nci_mode, + nci_request(ndev, nci_rf_deactivate_req, (void *)nci_mode, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } } @@ -985,8 +984,8 @@ static int nci_dep_link_down(struct nfc_dev *nfc_dev) } else { if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE || atomic_read(&ndev->state) == NCI_DISCOVERY) { - nci_request(ndev, nci_rf_deactivate_req, 0, - msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); + nci_request(ndev, nci_rf_deactivate_req, (void *)0, + msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } rc = nfc_tm_deactivated(nfc_dev); @@ -1004,7 +1003,7 @@ static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; if (!conn_info) @@ -1102,7 +1101,7 @@ static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) return ndev->ops->fw_download(ndev, firmware_name); } -static struct nfc_ops nci_nfc_ops = { +static const struct nfc_ops nci_nfc_ops = { .dev_up = nci_dev_up, .dev_down = nci_dev_down, .start_poll = nci_start_poll, @@ -1129,7 +1128,7 @@ static struct nfc_ops nci_nfc_ops = { * @tx_headroom: Reserved space at beginning of skb * @tx_tailroom: Reserved space at end of skb */ -struct nci_dev *nci_allocate_device(struct nci_ops *ops, +struct nci_dev *nci_allocate_device(const struct nci_ops *ops, __u32 supported_protocols, int tx_headroom, int tx_tailroom) { @@ -1152,8 +1151,7 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) { pr_err("Too many proprietary commands: %zd\n", ops->n_prop_ops); - ops->prop_ops = NULL; - ops->n_prop_ops = 0; + goto free_nci; } ndev->tx_headroom = tx_headroom; @@ -1270,7 +1268,7 @@ EXPORT_SYMBOL(nci_register_device); */ void nci_unregister_device(struct nci_dev *ndev) { - struct nci_conn_info *conn_info, *n; + struct nci_conn_info *conn_info, *n; nci_close_device(ndev); @@ -1332,7 +1330,7 @@ int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) EXPORT_SYMBOL(nci_send_frame); /* Send NCI command */ -int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) +int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload) { struct nci_ctrl_hdr *hdr; struct sk_buff *skb; @@ -1364,12 +1362,12 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) EXPORT_SYMBOL(nci_send_cmd); /* Proprietary commands API */ -static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, - size_t n_ops, - __u16 opcode) +static const struct nci_driver_ops *ops_cmd_lookup(const struct nci_driver_ops *ops, + size_t n_ops, + __u16 opcode) { size_t i; - struct nci_driver_ops *op; + const struct nci_driver_ops *op; if (!ops || !n_ops) return NULL; @@ -1384,10 +1382,10 @@ static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, } static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, - struct sk_buff *skb, struct nci_driver_ops *ops, + struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { - struct nci_driver_ops *op; + const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, rsp_opcode); if (!op || !op->rsp) @@ -1397,10 +1395,10 @@ static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, } static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, - struct sk_buff *skb, struct nci_driver_ops *ops, + struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { - struct nci_driver_ops *op; + const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, ntf_opcode); if (!op || !op->ntf) @@ -1442,7 +1440,7 @@ int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, static void nci_tx_work(struct work_struct *work) { struct nci_dev *ndev = container_of(work, struct nci_dev, tx_work); - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct sk_buff *skb; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index ce3382be937f..6055dc9a82aa 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -26,7 +26,7 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, __u8 conn_id, int err) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; data_exchange_cb_t cb; void *cb_context; @@ -80,7 +80,7 @@ static inline void nci_push_data_hdr(struct nci_dev *ndev, int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) @@ -93,9 +93,9 @@ EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size); static int nci_queue_tx_data_frags(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; int total_len = skb->len; - unsigned char *data = skb->data; + const unsigned char *data = skb->data; unsigned long flags; struct sk_buff_head frags_q; struct sk_buff *skb_frag; @@ -166,7 +166,7 @@ exit: /* Send NCI data */ int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; int rc = 0; pr_debug("conn_id 0x%x, plen %d\n", conn_id, skb->len); @@ -269,7 +269,7 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) __u8 pbf = nci_pbf(skb->data); __u8 status = 0; __u8 conn_id = nci_conn_id(skb->data); - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; pr_debug("len %d\n", skb->len); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 96865142104f..e199912ee1e5 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -16,11 +16,11 @@ #include <linux/nfc.h> struct nci_data { - u8 conn_id; - u8 pipe; - u8 cmd; - const u8 *data; - u32 data_len; + u8 conn_id; + u8 pipe; + u8 cmd; + const u8 *data; + u32 data_len; } __packed; struct nci_hci_create_pipe_params { @@ -142,7 +142,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; @@ -161,8 +161,6 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, *(u8 *)skb_push(skb, 1) = data_type; do { - len = conn_info->max_pkt_payload_len; - /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { @@ -197,9 +195,9 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, return i; } -static void nci_hci_send_data_req(struct nci_dev *ndev, unsigned long opt) +static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt) { - struct nci_data *data = (struct nci_data *)opt; + const struct nci_data *data = opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); @@ -223,8 +221,8 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -242,7 +240,7 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, data.data = param; data.data_len = param_len; - r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, + r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; @@ -365,7 +363,7 @@ exit: static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) @@ -408,7 +406,7 @@ static void nci_hci_msg_rx_work(struct work_struct *work) struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; - struct nci_hcp_message *message; + const struct nci_hcp_message *message; u8 pipe, type, instruction; while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) { @@ -500,7 +498,7 @@ void nci_hci_data_received_cb(void *context, int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) @@ -513,9 +511,8 @@ int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) data.data = NULL; data.data_len = 0; - return nci_request(ndev, nci_hci_send_data_req, - (unsigned long)&data, - msecs_to_jiffies(NCI_DATA_TIMEOUT)); + return nci_request(ndev, nci_hci_send_data_req, &data, + msecs_to_jiffies(NCI_DATA_TIMEOUT)); } EXPORT_SYMBOL(nci_hci_open_pipe); @@ -525,7 +522,7 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; - struct nci_hci_create_pipe_resp *resp; + const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); @@ -559,8 +556,8 @@ static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; @@ -589,8 +586,7 @@ int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, data.data = tmp; data.data_len = param_len + 1; - r = nci_request(ndev, nci_hci_send_data_req, - (unsigned long)&data, + r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; @@ -607,8 +603,8 @@ EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -629,7 +625,7 @@ int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, data.data = &idx; data.data_len = 1; - r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, + r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { @@ -699,7 +695,7 @@ EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, - struct nci_hci_gate *gates) + const struct nci_hci_gate *gates) { int r; @@ -716,7 +712,7 @@ static int nci_hci_dev_connect_gates(struct nci_dev *ndev, int nci_hci_dev_session_init(struct nci_dev *ndev) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct sk_buff *skb; int r; diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 98af04c86b2c..c5eacaac41ae 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -28,10 +28,10 @@ /* Handle NCI Notification packets */ static void nci_core_reset_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ - struct nci_core_reset_ntf *ntf = (void *)skb->data; + const struct nci_core_reset_ntf *ntf = (void *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", @@ -48,7 +48,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { struct nci_core_conn_credit_ntf *ntf = (void *) skb->data; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; int i; pr_debug("num_entries %d\n", ntf->num_entries); @@ -80,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, } static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -107,9 +107,10 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); } -static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfca_poll *nfca_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfca_poll *nfca_poll, + const __u8 *data) { nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data)); data += 2; @@ -134,9 +135,10 @@ static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcb_poll *nfcb_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcb_poll *nfcb_poll, + const __u8 *data) { nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE); @@ -148,9 +150,10 @@ static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcf_poll *nfcf_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_poll *nfcf_poll, + const __u8 *data) { nfcf_poll->bit_rate = *data++; nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE); @@ -164,9 +167,10 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcv_poll *nfcv_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcv_poll *nfcv_poll, + const __u8 *data) { ++data; nfcv_poll->dsfid = *data++; @@ -175,9 +179,10 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcf_listen *nfcf_listen, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_listen *nfcf_listen, + const __u8 *data) { nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, NFC_NFCID2_MAXSIZE); @@ -198,12 +203,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, __u8 rf_tech_and_mode, - void *params) + const void *params) { - struct rf_tech_specific_params_nfca_poll *nfca_poll; - struct rf_tech_specific_params_nfcb_poll *nfcb_poll; - struct rf_tech_specific_params_nfcf_poll *nfcf_poll; - struct rf_tech_specific_params_nfcv_poll *nfcv_poll; + const struct rf_tech_specific_params_nfca_poll *nfca_poll; + const struct rf_tech_specific_params_nfcb_poll *nfcb_poll; + const struct rf_tech_specific_params_nfcf_poll *nfcf_poll; + const struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; if (rf_protocol == NCI_RF_PROTOCOL_T1T) @@ -274,7 +279,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, } static void nci_add_new_target(struct nci_dev *ndev, - struct nci_rf_discover_ntf *ntf) + const struct nci_rf_discover_ntf *ntf) { struct nfc_target *target; int i, rc; @@ -319,10 +324,10 @@ void nci_clear_target_list(struct nci_dev *ndev) } static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; - __u8 *data = skb->data; + const __u8 *data = skb->data; bool add_target = true; ntf.rf_discovery_id = *data++; @@ -382,7 +387,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf, __u8 *data) + struct nci_rf_intf_activated_ntf *ntf, + const __u8 *data) { struct activation_params_nfca_poll_iso_dep *nfca_poll; struct activation_params_nfcb_poll_iso_dep *nfcb_poll; @@ -418,7 +424,8 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, } static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf, __u8 *data) + struct nci_rf_intf_activated_ntf *ntf, + const __u8 *data) { struct activation_params_poll_nfc_dep *poll; struct activation_params_listen_nfc_dep *listen; @@ -454,7 +461,7 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, } static void nci_target_auto_activated(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf) + const struct nci_rf_intf_activated_ntf *ntf) { struct nfc_target *target; int rc; @@ -477,7 +484,7 @@ static void nci_target_auto_activated(struct nci_dev *ndev, } static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf) + const struct nci_rf_intf_activated_ntf *ntf) { ndev->remote_gb_len = 0; @@ -519,11 +526,11 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, } static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; - __u8 *data = skb->data; + const __u8 *data = skb->data; int err = NCI_STATUS_OK; ntf.rf_discovery_id = *data++; @@ -681,10 +688,10 @@ listen: } static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; - struct nci_rf_deactivate_ntf *ntf = (void *) skb->data; + const struct nci_conn_info *conn_info; + const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data; pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason); @@ -725,10 +732,10 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, } static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { u8 status = NCI_STATUS_OK; - struct nci_nfcee_discover_ntf *nfcee_ntf = + const struct nci_nfcee_discover_ntf *nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; pr_debug("\n"); @@ -745,7 +752,7 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, } static void nci_nfcee_action_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { pr_debug("\n"); } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index e9605922a322..a2e72c003805 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -25,9 +25,10 @@ /* Handle NCI Response packets */ -static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_core_reset_rsp_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_reset_rsp *rsp = (void *) skb->data; + const struct nci_core_reset_rsp *rsp = (void *)skb->data; pr_debug("status 0x%x\n", rsp->status); @@ -43,10 +44,11 @@ static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) } } -static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data; - struct nci_core_init_rsp_2 *rsp_2; + const struct nci_core_init_rsp_1 *rsp_1 = (void *)skb->data; + const struct nci_core_init_rsp_2 *rsp_2; pr_debug("status 0x%x\n", rsp_1->status); @@ -81,10 +83,11 @@ static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) return NCI_STATUS_OK; } -static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; - u8 *supported_rf_interface = rsp->supported_rf_interfaces; + const struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; + const u8 *supported_rf_interface = rsp->supported_rf_interfaces; u8 rf_interface_idx = 0; u8 rf_extension_cnt = 0; @@ -118,7 +121,7 @@ static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) return NCI_STATUS_OK; } -static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_core_init_rsp_packet(struct nci_dev *ndev, const struct sk_buff *skb) { u8 status = 0; @@ -160,9 +163,9 @@ exit: } static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_core_set_config_rsp *rsp = (void *) skb->data; + const struct nci_core_set_config_rsp *rsp = (void *)skb->data; pr_debug("status 0x%x\n", rsp->status); @@ -170,7 +173,7 @@ static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, } static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -179,9 +182,10 @@ static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev, nci_req_complete(ndev, status); } -static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; __u8 status = skb->data[0]; pr_debug("status 0x%x\n", status); @@ -210,7 +214,7 @@ exit: } static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -222,7 +226,7 @@ static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev, } static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -238,9 +242,9 @@ static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev, } static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_nfcee_discover_rsp *discover_rsp; + const struct nci_nfcee_discover_rsp *discover_rsp; if (skb->len != 2) { nci_req_complete(ndev, NCI_STATUS_NFCEE_PROTOCOL_ERROR); @@ -255,7 +259,7 @@ static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev, } static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -264,11 +268,11 @@ static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev, } static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; struct nci_conn_info *conn_info = NULL; - struct nci_core_conn_create_rsp *rsp; + const struct nci_core_conn_create_rsp *rsp; pr_debug("status 0x%x\n", status); @@ -319,7 +323,7 @@ exit: } static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nci_conn_info *conn_info; __u8 status = skb->data[0]; diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index 7d8e10e27c20..0935527d1d12 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -27,7 +27,7 @@ #define CRC_INIT 0xFFFF -static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb, +static int __nci_spi_send(struct nci_spi *nspi, const struct sk_buff *skb, int cs_change) { struct spi_message m; diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 1248faf4d6df..502e7a3f8948 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -308,7 +308,7 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, * Return Value: None */ static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data, - char *flags, int count) + const char *flags, int count) { struct nci_uart *nu = (void *)tty->disc_data; @@ -442,6 +442,7 @@ EXPORT_SYMBOL_GPL(nci_uart_set_config); static struct tty_ldisc_ops nci_uart_ldisc = { .owner = THIS_MODULE, + .num = N_NCI, .name = "n_nci", .open = nci_uart_tty_open, .close = nci_uart_tty_close, @@ -456,12 +457,12 @@ static struct tty_ldisc_ops nci_uart_ldisc = { static int __init nci_uart_init(void) { - return tty_register_ldisc(N_NCI, &nci_uart_ldisc); + return tty_register_ldisc(&nci_uart_ldisc); } static void __exit nci_uart_exit(void) { - tty_unregister_ldisc(N_NCI); + tty_unregister_ldisc(&nci_uart_ldisc); } module_init(nci_uart_init); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 722f7ef891e1..49089c50872e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -530,7 +530,7 @@ free_msg: int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { - struct nfc_se *se; + const struct nfc_se *se; struct sk_buff *msg; void *hdr; @@ -1531,7 +1531,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; - struct nfc_vendor_cmd *cmd; + const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 889fefd64e56..de2ec66d7e83 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -48,7 +48,7 @@ void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode); int nfc_llcp_register_device(struct nfc_dev *dev); void nfc_llcp_unregister_device(struct nfc_dev *dev); -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len); +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len); u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len); int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 5f1d438a0a23..0ca214ab5aef 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -49,7 +49,7 @@ static void rawsock_report_error(struct sock *sk, int err) sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_err = -err; - sk->sk_error_report(sk); + sk_error_report(sk); rawsock_write_queue_purge(sk); } @@ -140,7 +140,7 @@ static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb, { struct sock *sk = (struct sock *) context; - BUG_ON(in_irq()); + BUG_ON(in_hardirq()); pr_debug("sk=%p err=%d\n", sk, err); diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 41109c326f3a..28982630bef3 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -13,6 +13,7 @@ openvswitch-y := \ flow_netlink.o \ flow_table.o \ meter.o \ + openvswitch_trace.o \ vport.o \ vport-internal_dev.o \ vport-netdev.o @@ -24,3 +25,5 @@ endif obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o + +CFLAGS_openvswitch_trace.o = -I$(src) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 77d924ab8cdb..076774034bb9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -30,6 +30,7 @@ #include "conntrack.h" #include "vport.h" #include "flow_netlink.h" +#include "openvswitch_trace.h" struct deferred_action { struct sk_buff *skb; @@ -923,7 +924,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); + if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, + smp_processor_id()); + else + upcall.portid = nla_get_u32(a); break; case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { @@ -1242,6 +1249,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; + if (trace_ovs_do_execute_action_enabled()) + trace_ovs_do_execute_action(dp, skb, key, a, rem); + switch (nla_type(a)) { case OVS_ACTION_ATTR_OUTPUT: { int port = nla_get_u32(a); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index cadb6a29b285..1b5eae57bc90 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -967,8 +967,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - if (skb_nfct(skb)) - nf_conntrack_put(skb_nfct(skb)); + nf_conntrack_put(skb_nfct(skb)); nf_conntrack_get(&tmpl->ct_general); nf_ct_set(skb, tmpl, IP_CT_NEW); } @@ -1329,11 +1328,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) { - if (skb_nfct(skb)) { - nf_conntrack_put(skb_nfct(skb)); - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - ovs_ct_fill_key(skb, key, false); - } + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key, false); return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9d6ef6cb9b26..67ad08320886 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -43,6 +43,7 @@ #include "flow_table.h" #include "flow_netlink.h" #include "meter.h" +#include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -132,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, static void ovs_dp_masks_rebalance(struct work_struct *work); +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -165,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); + kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } @@ -238,7 +242,13 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) @@ -275,6 +285,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, struct dp_stats_percpu *stats; int err; + if (trace_ovs_dp_upcall_enabled()) + trace_ovs_dp_upcall(dp, skb, key, upcall_info); + if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; @@ -1590,16 +1603,70 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} + static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0; + int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | - OVS_DP_F_TC_RECIRC_SHARING)) + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) @@ -1620,6 +1687,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) dp->user_features = user_features; + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) static_branch_enable(&tc_recirc_sharing_support); else diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 38f7d3e66ca6..fcfe6cb46441 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -51,6 +51,21 @@ struct dp_stats_percpu { }; /** + * struct dp_nlsk_pids - array of netlink portids of for a datapath. + * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU + * is enabled and must be protected by rcu. + * @rcu: RCU callback head for deferred destruction. + * @n_pids: Size of @pids array. + * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets + * that miss the flow table. + */ +struct dp_nlsk_pids { + struct rcu_head rcu; + u32 n_pids; + u32 pids[]; +}; + +/** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. @@ -61,6 +76,7 @@ struct dp_stats_percpu { * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. + * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -87,6 +103,8 @@ struct datapath { /* Switch meters. */ struct dp_meter_table meter_tbl; + + struct dp_nlsk_pids __rcu *upcall_portids; }; /** @@ -243,6 +261,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); + const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index e586424d8b04..9713035b89e3 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -293,14 +293,14 @@ static bool icmp6hdr_ok(struct sk_buff *skb) } /** - * Parse vlan tag from vlan header. + * parse_vlan_tag - Parse vlan tag from vlan header. * @skb: skb containing frame to parse * @key_vh: pointer to parsed vlan tag * @untag_vlan: should the vlan header be removed from the frame * - * Returns ERROR on memory error. - * Returns 0 if it encounters a non-vlan or incomplete packet. - * Returns 1 after successfully parsing vlan tag. + * Return: ERROR on memory error. + * %0 if it encounters a non-vlan or incomplete packet. + * %1 after successfully parsing vlan tag. */ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, bool untag_vlan) @@ -532,6 +532,7 @@ static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) * L3 header * @key: output flow key * + * Return: %0 if successful, otherwise a negative errno value. */ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) { @@ -748,8 +749,6 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) * * The caller must ensure that skb->len >= ETH_HLEN. * - * Returns 0 if successful, otherwise a negative errno value. - * * Initializes @skb header fields as follows: * * - skb->mac_header: the L2 header. @@ -764,6 +763,8 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) * * - skb->protocol: the type of the data starting at skb->network_header. * Equals to key->eth.type. + * + * Return: %0 if successful, otherwise a negative errno value. */ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index c89c8da99f1a..d4a2db0b2299 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -670,13 +670,13 @@ static bool cmp_key(const struct sw_flow_key *key1, { const long *cp1 = (const long *)((const u8 *)key1 + key_start); const long *cp2 = (const long *)((const u8 *)key2 + key_start); - long diffs = 0; int i; for (i = key_start; i < key_end; i += sizeof(long)) - diffs |= *cp1++ ^ *cp2++; + if (*cp1++ ^ *cp2++) + return false; - return diffs == 0; + return true; } static bool flow_cmp_masked_key(const struct sw_flow *flow, diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c new file mode 100644 index 000000000000..62c5f7d6f023 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* bug in tracepoint.h, it should include this */ +#include <linux/module.h> + +/* sparse isn't too happy with all macros... */ +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "openvswitch_trace.h" + +#endif diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h new file mode 100644 index 000000000000..3eb35d9eb700 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM openvswitch + +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OPENVSWITCH_H + +#include <linux/tracepoint.h> + +#include "datapath.h" + +TRACE_EVENT(ovs_do_execute_action, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *a, int rem), + + TP_ARGS(dp, skb, key, a, rem), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, action_type ) + __field( unsigned int, action_len ) + __field( void *, action_data ) + __field( u8, is_last ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name, ovs_dp_name(dp)); + __assign_str(dev_name, skb->dev->name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->action_type = nla_type(a); + __entry->action_len = nla_len(a); + __entry->action_data = nla_data(a); + __entry->is_last = nla_is_last(a, rem); + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_Zone=%04x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->action_type, __entry->action_len, + __entry->action_data, __entry->is_last) +); + +TRACE_EVENT(ovs_dp_upcall, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info), + + TP_ARGS(dp, skb, key, upcall_info), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( const void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, upcall_cmd ) + __field( u32, upcall_port ) + __field( u16, upcall_mru ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name, ovs_dp_name(dp)); + __assign_str(dev_name, skb->dev->name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->upcall_cmd = upcall_info->cmd; + __entry->upcall_port = upcall_info->portid; + __entry->upcall_mru = upcall_info->mru; + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_zone=%04x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->upcall_cmd, __entry->upcall_port, + __entry->upcall_mru) +); + +#endif /* _TRACE_OPENVSWITCH_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE openvswitch_trace +#include <trace/define_trace.h> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 88deb5b41429..cf2ce5812489 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -507,6 +507,7 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) } skb->dev = vport->dev; + skb->tstamp = 0; vport->ops->send(skb); return; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 330ba68828e7..543365f58e97 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -250,8 +250,7 @@ static struct net_device *packet_cached_dev_get(struct packet_sock *po) rcu_read_lock(); dev = rcu_dereference(po->cached_dev); - if (likely(dev)) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; @@ -1656,6 +1655,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; + break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: @@ -3023,8 +3023,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) out_free: kfree_skb(skb); out_unlock: - if (dev) - dev_put(dev); + dev_put(dev); out: return err; } @@ -3157,8 +3156,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, } } - if (dev) - dev_hold(dev); + dev_hold(dev); proto_curr = po->prot_hook.type; dev_curr = po->prot_hook.dev; @@ -3195,8 +3193,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, packet_cached_dev_assign(po, dev); } } - if (dev_curr) - dev_put(dev_curr); + dev_put(dev_curr); if (proto == 0 || !need_rehook) goto out_unlock; @@ -3206,7 +3203,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } out_unlock: @@ -3934,12 +3931,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, return -EFAULT; lock_sock(sk); - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { - ret = -EBUSY; - } else { + if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) po->tp_tx_has_off = !!val; - ret = 0; - } + release_sock(sk); return 0; } @@ -4106,13 +4100,12 @@ static int packet_notifier(struct notifier_block *this, __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); - if (po->prot_hook.dev) - dev_put(po->prot_hook.dev); + dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index ca6ae4c59433..65218b7ce9f9 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -275,8 +275,7 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb, drop: kfree_skb(skb); - if (dev) - dev_put(dev); + dev_put(dev); return err; } EXPORT_SYMBOL(pn_skb_send); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index ac0fae06cc15..cde671d29d5d 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -122,8 +122,7 @@ struct net_device *phonet_device_get(struct net *net) break; dev = NULL; } - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -233,11 +232,11 @@ static int phonet_device_autoconf(struct net_device *dev) struct if_phonet_req req; int ret; - if (!dev->netdev_ops->ndo_do_ioctl) + if (!dev->netdev_ops->ndo_siocdevprivate) return -EOPNOTSUPP; - ret = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *)&req, - SIOCPNGAUTOCONF); + ret = dev->netdev_ops->ndo_siocdevprivate(dev, (struct ifreq *)&req, + NULL, SIOCPNGAUTOCONF); if (ret < 0) return ret; @@ -411,8 +410,7 @@ struct net_device *phonet_route_output(struct net *net, u8 daddr) daddr >>= 2; rcu_read_lock(); dev = rcu_dereference(routes->table[daddr]); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); if (!dev) diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 2599235d592e..71e2caf6ab85 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -379,8 +379,7 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, saddr = PN_NO_ADDR; release_sock(sk); - if (dev) - dev_put(dev); + dev_put(dev); if (saddr == PN_NO_ADDR) return -EHOSTUNREACH; diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 8d00dfe8139e..1990d496fcfc 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -775,8 +775,10 @@ int qrtr_ns_init(void) } qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1); - if (!qrtr_ns.workqueue) + if (!qrtr_ns.workqueue) { + ret = -ENOMEM; goto err_sock; + } qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index f2efaa4225f9..ec2322529727 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -493,7 +493,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) goto err; } - if (len != ALIGN(size, 4) + hdrlen) + if (!size || len != ALIGN(size, 4) + hdrlen) goto err; if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && @@ -506,8 +506,12 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (cb->type == QRTR_TYPE_NEW_SERVER) { /* Remote node endpoint can bridge other distant nodes */ - const struct qrtr_ctrl_pkt *pkt = data + hdrlen; + const struct qrtr_ctrl_pkt *pkt; + if (size < sizeof(*pkt)) + goto err; + + pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } @@ -518,8 +522,10 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (!ipc) goto err; - if (sock_queue_rcv_skb(&ipc->sk, skb)) + if (sock_queue_rcv_skb(&ipc->sk, skb)) { + qrtr_port_put(ipc); goto err; + } qrtr_port_put(ipc); } @@ -751,7 +757,7 @@ static void qrtr_reset_ports(void) xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; - ipc->sk.sk_error_report(&ipc->sk); + sk_error_report(&ipc->sk); sock_put(&ipc->sk); } rcu_read_unlock(); @@ -839,6 +845,8 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ + if (ipc) + qrtr_port_put(ipc); kfree_skb(skb); return -ENODEV; } @@ -1153,14 +1161,14 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: - if (copy_from_user(&ifr, argp, sizeof(ifr))) { + if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; - if (copy_to_user(argp, &ifr, sizeof(ifr))) { + if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 9b6ffff72f2d..28c1b0022178 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -131,9 +131,9 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_len)) + if (unlikely(ret != ibmr->sg_dma_len)) return ret < 0 ? ret : -EINVAL; if (cmpxchg(&frmr->fr_state, diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c index ff97e8eda858..006b2e441418 100644 --- a/net/rds/ib_ring.c +++ b/net/rds/ib_ring.c @@ -141,7 +141,7 @@ int rds_ib_ring_low(struct rds_ib_work_ring *ring) } /* - * returns the oldest alloced ring entry. This will be the next one + * returns the oldest allocated ring entry. This will be the next one * freed. This can't be called if there are none allocated. */ u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 4e64598176b0..5461d77fff4f 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -78,6 +78,7 @@ void rds_tcp_state_change(struct sock *sk) case TCP_CLOSE_WAIT: case TCP_CLOSE: rds_conn_path_drop(cp, false); + break; default: break; } diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 42c5ff1eda95..f4ee13da90c7 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -177,7 +177,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } tc->t_tinc = tinc; - rdsdebug("alloced tinc %p\n", tinc); + rdsdebug("allocated tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, &cp->cp_conn->c_faddr); tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = diff --git a/net/rds/threads.c b/net/rds/threads.c index 32dc50f0a303..1f424cbfcbb4 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -208,6 +208,7 @@ void rds_send_worker(struct work_struct *work) case -ENOMEM: rds_stats_inc(s_send_delayed_retry); queue_delayed_work(rds_wq, &cp->cp_send_w, 2); + break; default: break; } @@ -232,6 +233,7 @@ void rds_recv_worker(struct work_struct *work) case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); + break; default: break; } diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 0885b22e5c0e..accd35c05577 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -21,6 +21,8 @@ config AF_RXRPC See Documentation/networking/rxrpc.rst. +if AF_RXRPC + config AF_RXRPC_IPV6 bool "IPv6 support for RxRPC" depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC) @@ -30,7 +32,6 @@ config AF_RXRPC_IPV6 config AF_RXRPC_INJECT_LOSS bool "Inject packet loss into RxRPC packet stream" - depends on AF_RXRPC help Say Y here to inject packet loss by discarding some received and some transmitted packets. @@ -38,7 +39,6 @@ config AF_RXRPC_INJECT_LOSS config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" - depends on AF_RXRPC help Say Y here to make runtime controllable debugging messages appear. @@ -47,7 +47,6 @@ config AF_RXRPC_DEBUG config RXKAD bool "RxRPC Kerberos security" - depends on AF_RXRPC select CRYPTO select CRYPTO_MANAGER select CRYPTO_SKCIPHER @@ -58,3 +57,5 @@ config RXKAD through the use of the key retention service. See Documentation/networking/rxrpc.rst. + +endif diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 41671af6b33f..2b5f89713e36 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -471,6 +471,7 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, switch (rx->sk.sk_state) { case RXRPC_UNBOUND: rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + break; case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: break; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 3ce6d628cd75..19e929c7c38b 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -77,7 +77,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, } /* - * Process event packets targetted at a local endpoint. + * Process event packets targeted at a local endpoint. */ void rxrpc_process_local_events(struct rxrpc_local *local) { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f6d5755d669e..7dd3a2dc5fa4 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -381,7 +381,8 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, } mutex_unlock(&idrinfo->lock); - if (nla_put_u32(skb, TCA_FCNT, n_i)) + ret = nla_put_u32(skb, TCA_FCNT, n_i); + if (ret) goto nla_put_failure; nla_nest_end(skb, nest); @@ -494,7 +495,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; - p->tcfa_flags = flags; + p->tcfa_flags = flags & TCA_ACT_FLAGS_USER_MASK; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, @@ -940,7 +941,7 @@ void tcf_idr_insert_many(struct tc_action *actions[]) } } -struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, bool rtnl_held, struct netlink_ext_ack *extack) { @@ -950,7 +951,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, struct nlattr *kind; int err; - if (name == NULL) { + if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) @@ -966,7 +967,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, return ERR_PTR(err); } } else { - if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { + if (strlcpy(act_name, "police", IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } @@ -1003,12 +1004,11 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, struct tc_action_ops *a_o, int *init_res, - bool rtnl_held, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; + bool police = flags & TCA_ACT_FLAGS_POLICE; + struct nla_bitfield32 userflags = { 0, 0 }; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_cookie *cookie = NULL; @@ -1016,7 +1016,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; /* backward compatibility for policer */ - if (name == NULL) { + if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) @@ -1031,22 +1031,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, } hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, flags.value, extack); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp, + userflags.value | flags, extack); } else { - err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, flags.value, extack); + err = a_o->init(net, nla, est, &a, tp, userflags.value | flags, + extack); } if (err < 0) goto err_out; *init_res = err; - if (!name && tb[TCA_ACT_COOKIE]) + if (!police && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); - if (!name) + if (!police) a->hw_stats = hw_stats; return a; @@ -1062,9 +1062,9 @@ err_out: /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, int bind, - struct tc_action *actions[], int init_res[], size_t *attr_size, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action *actions[], + int init_res[], size_t *attr_size, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -1081,7 +1081,9 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + a_o = tc_action_load_ops(tb[i], flags & TCA_ACT_FLAGS_POLICE, + !(flags & TCA_ACT_FLAGS_NO_RTNL), + extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; @@ -1090,9 +1092,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - ops[i - 1], &init_res[i - 1], rtnl_held, - extack); + act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1], + &init_res[i - 1], flags, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1112,7 +1113,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, goto err_mod; err: - tcf_action_destroy(actions, bind); + tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { if (ops[i]) @@ -1350,8 +1351,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - return 0; if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); @@ -1422,8 +1421,6 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } @@ -1480,7 +1477,6 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; - int err = 0; skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, GFP_KERNEL); @@ -1494,15 +1490,12 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -EINVAL; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; - return err; + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr, + struct nlmsghdr *n, u32 portid, u32 flags, struct netlink_ext_ack *extack) { size_t attr_size = 0; @@ -1511,8 +1504,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, int init_res[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, - actions, init_res, &attr_size, true, extack); + ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res, + &attr_size, flags, extack); if (ret != -EAGAIN) break; } @@ -1542,7 +1535,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; - int ret = 0, ovr = 0; + u32 flags = 0; + int ret = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) @@ -1568,8 +1562,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) - ovr = 1; - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, + flags = TCA_ACT_FLAGS_REPLACE; + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags, extack); break; case RTM_DELACTION: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e48e980c3b93..5c36013339e1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -43,20 +43,18 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - rcu_read_lock(); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); - rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the @@ -277,11 +275,11 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_bpf_cfg cfg, old; @@ -319,7 +317,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - if (!replace) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*act, bind); return -EEXIST; } diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e19885d7fe2c..94e78ac7a748 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,12 +96,12 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_connmark_info *ci; struct tc_connmark *parm; @@ -144,7 +144,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci = to_connmark(*a); if (bind) return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4fa4fcb842ba..a15ec95e69c3 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -41,11 +41,12 @@ static unsigned int csum_net_id; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -78,7 +79,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index a656baa321fe..ad9df0cb4b98 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -322,11 +322,22 @@ err_alloc: static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) { + struct flow_block_cb *block_cb, *tmp_cb; struct tcf_ct_flow_table *ct_ft; + struct flow_block *block; ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, rwork); nf_flow_table_free(&ct_ft->nf_ft); + + /* Remove any remaining callbacks before cleanup */ + block = &ct_ft->nf_ft.flow_block; + down_write(&ct_ft->nf_ft.flow_block_lock); + list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + up_write(&ct_ft->nf_ft.flow_block_lock); kfree(ct_ft); module_put(THIS_MODULE); @@ -1026,7 +1037,8 @@ do_nat: /* This will take care of sending queued events * even if the connection is already confirmed. */ - nf_conntrack_confirm(skb); + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + goto drop; } if (!skip_add) @@ -1223,11 +1235,11 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int replace, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_ct_params *params = NULL; struct nlattr *tb[TCA_CT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -1267,7 +1279,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (bind) return 0; - if (!replace) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index b20c8ce59905..549374a2d008 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -154,11 +154,11 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; struct tcf_ctinfo_params *cp_new; @@ -221,7 +221,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind) /* don't override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 73c3926358a0..d8dce173df37 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -52,11 +52,11 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GACT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_gact *parm; @@ -109,7 +109,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index a78cb7965718..7df72a4197a3 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -295,12 +295,12 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gate_net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; @@ -364,7 +364,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a2ddea04183a..7064a365a1a9 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -479,11 +479,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -532,7 +532,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, kfree(p); return err; } - err = load_metalist(tb2, rtnl_held); + err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) { kfree(p); return err; @@ -560,7 +560,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); kfree(p); return -EEXIST; @@ -600,7 +600,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } if (tb[TCA_IFE_METALST]) { - err = populate_metalist(ife, tb2, exists, rtnl_held); + err = populate_metalist(ife, tb2, exists, + !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) goto metadata_parse_err; } else { diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index ac7297f42355..265b1443e252 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -94,10 +94,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - const struct tc_action_ops *ops, int ovr, int bind, + const struct tc_action_ops *ops, struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; @@ -154,7 +155,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -201,21 +202,21 @@ err1: } static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp, flags); + return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, + tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool unlocked, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp, flags); + return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, + tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7153c67f641e..d64b0eeccbe4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -78,8 +78,7 @@ static void tcf_mirred_release(struct tc_action *a) /* last reference to action, no need to lock */ dev = rcu_dereference_protected(m->tcfm_dev, 1); - if (dev) - dev_put(dev); + dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -91,11 +90,11 @@ static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; @@ -155,7 +154,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -180,8 +179,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, mac_header_xmit = dev_is_mac_header_xmit(dev); dev = rcu_replace_pointer(m->tcfm_dev, dev, lockdep_is_held(&m->tcf_lock)); - if (dev) - dev_put(dev); + dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -273,6 +271,9 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } + /* All mirred/redirected skbs should clear previous ct info */ + nf_reset_ct(skb2); + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); expects_nh = want_ingress || !m_mac_header_xmit; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index d1486ea496a2..e4529b428cf4 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -152,11 +152,11 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MPLS_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_mpls_params *p; @@ -255,7 +255,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 1ebd2a86d980..7dd6b586ba7f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -34,11 +34,11 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action **a, int ovr, int bind, - bool rtnl_held, struct tcf_proto *tp, + struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_NAT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_nat *parm; @@ -70,7 +70,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, } else if (err > 0) { if (bind) return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b45304446e13..c6c862c459cc 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -136,11 +136,11 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_pedit_key *keys = NULL; @@ -198,7 +198,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind) goto out_free; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0fab8de176d2..832157a840fc 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -48,11 +48,11 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_police *parm; @@ -97,7 +97,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; spin_lock_init(&(to_police(*a)->tcfp_lock)); - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 6a0c16e4351d..230501eb9e06 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -34,11 +34,12 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { }; static int tcf_sample_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; u32 psample_group_num, rate, index; @@ -75,7 +76,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 726cc956d06f..cbbe1861d3a2 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -85,11 +85,11 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; @@ -147,7 +147,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e5f3fb8b00e3..605418538347 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -96,11 +96,11 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -186,7 +186,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - if (!ovr) { + if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 81a1c67335be..ecb9ee666095 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -6,10 +6,12 @@ */ #include <linux/module.h> +#include <linux/if_arp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/inet_ecn.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -20,33 +22,49 @@ static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; -#define MAX_EDIT_LEN ETH_HLEN static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action; + int action, max_edit_len, err; struct tcf_skbmod_params *p; u64 flags; - int err; tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - /* XXX: if you are going to edit more fields beyond ethernet header - * (example when you add IP header replacement or vlan swap) - * then MAX_EDIT_LEN needs to change appropriately - */ - err = skb_ensure_writable(skb, MAX_EDIT_LEN); - if (unlikely(err)) /* best policy is to drop on the floor */ - goto drop; - action = READ_ONCE(d->tcf_action); if (unlikely(action == TC_ACT_SHOT)) goto drop; + max_edit_len = skb_mac_header_len(skb); p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; + + /* tcf_skbmod_init() guarantees "flags" to be one of the following: + * 1. a combination of SKBMOD_F_{DMAC,SMAC,ETYPE} + * 2. SKBMOD_F_SWAPMAC + * 3. SKBMOD_F_ECN + * SKBMOD_F_ECN only works with IP packets; all other flags only work with Ethernet + * packets. + */ + if (flags == SKBMOD_F_ECN) { + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + case cpu_to_be16(ETH_P_IPV6): + max_edit_len += skb_network_header_len(skb); + break; + default: + goto out; + } + } else if (!skb->dev || skb->dev->type != ARPHRD_ETHER) { + goto out; + } + + err = skb_ensure_writable(skb, max_edit_len); + if (unlikely(err)) /* best policy is to drop on the floor */ + goto drop; + if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); if (flags & SKBMOD_F_SMAC) @@ -62,6 +80,10 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); } + if (flags & SKBMOD_F_ECN) + INET_ECN_set_ce(skb); + +out: return action; drop: @@ -78,11 +100,12 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); + bool ovr = flags & TCA_ACT_FLAGS_REPLACE; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SKBMOD_MAX + 1]; struct tcf_skbmod_params *p, *p_old; struct tcf_chain *goto_ch = NULL; @@ -125,6 +148,8 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, index = parm->index; if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; + if (parm->flags & SKBMOD_F_ECN) + lflags = SKBMOD_F_ECN; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 85c0d0d5b9da..d9cd174eecb7 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -355,11 +355,11 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; @@ -504,7 +504,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "TC IDR already exists"); ret = -EEXIST; goto release_tun_meta; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1cac3c6fbb49..e4dc5a555bd8 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -70,7 +70,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* replace the vid */ tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (p->tcfv_push_prio) { + if (p->tcfv_push_prio_exists) { tci &= ~VLAN_PRIO_MASK; tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } @@ -114,13 +114,14 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; + bool push_prio_exists = false; struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; @@ -189,7 +190,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, push_proto = htons(ETH_P_8021Q); } - if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) + push_prio_exists = !!tb[TCA_VLAN_PUSH_VLAN_PRIORITY]; + if (push_prio_exists) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; case TCA_VLAN_ACT_POP_ETH: @@ -221,7 +223,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -241,6 +243,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; + p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH; p->tcfv_push_proto = push_proto; if (action == TCA_VLAN_ACT_PUSH_ETH) { @@ -304,8 +307,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, p->tcfv_push_proto) || - (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, - p->tcfv_push_prio)))) + (p->tcfv_push_prio_exists && + nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, p->tcfv_push_prio)))) goto nla_put_failure; if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 279f9e2a2319..2ef8f5a6205a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -634,6 +634,7 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, bo->block_shared = shared; bo->extack = extack; bo->sch = sch; + bo->cb_list_head = &flow_block->cb_list; INIT_LIST_HEAD(&bo->cb_list); } @@ -1531,7 +1532,7 @@ static inline int __tcf_classify(struct sk_buff *skb, u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT - const int max_reclassify_loop = 4; + const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; int limit = 0; @@ -1577,21 +1578,11 @@ reset: #endif } -int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, +int tcf_classify(struct sk_buff *skb, + const struct tcf_block *block, + const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { - u32 last_executed_chain = 0; - - return __tcf_classify(skb, tp, tp, res, compat_mode, - &last_executed_chain); -} -EXPORT_SYMBOL(tcf_classify); - -int tcf_classify_ingress(struct sk_buff *skb, - const struct tcf_block *ingress_block, - const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) -{ #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) u32 last_executed_chain = 0; @@ -1603,20 +1594,22 @@ int tcf_classify_ingress(struct sk_buff *skb, struct tc_skb_ext *ext; int ret; - ext = skb_ext_find(skb, TC_SKB_EXT); + if (block) { + ext = skb_ext_find(skb, TC_SKB_EXT); - if (ext && ext->chain) { - struct tcf_chain *fchain; + if (ext && ext->chain) { + struct tcf_chain *fchain; - fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain); - if (!fchain) - return TC_ACT_SHOT; + fchain = tcf_chain_lookup_rcu(block, ext->chain); + if (!fchain) + return TC_ACT_SHOT; - /* Consume, so cloned/redirect skbs won't inherit ext */ - skb_ext_del(skb, TC_SKB_EXT); + /* Consume, so cloned/redirect skbs won't inherit ext */ + skb_ext_del(skb, TC_SKB_EXT); - tp = rcu_dereference_bh(fchain->filter_chain); - last_executed_chain = fchain->index; + tp = rcu_dereference_bh(fchain->filter_chain); + last_executed_chain = fchain->index; + } } ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, @@ -1635,7 +1628,7 @@ int tcf_classify_ingress(struct sk_buff *skb, return ret; #endif } -EXPORT_SYMBOL(tcf_classify_ingress); +EXPORT_SYMBOL(tcf_classify); struct tcf_chain_info { struct tcf_proto __rcu **pprev; @@ -1870,13 +1863,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - - if (err > 0) - err = 0; return err; } @@ -1909,15 +1899,13 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); - if (err > 0) - err = 0; return err; } @@ -1962,6 +1950,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, int err; int tp_created; bool rtnl_held = false; + u32 flags; if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1982,6 +1971,7 @@ replay: tp = NULL; cl = 0; block = NULL; + flags = 0; if (prio == 0) { /* If no priority is provided by the user, @@ -2125,9 +2115,12 @@ replay: goto errout; } + if (!(n->nlmsg_flags & NLM_F_CREATE)) + flags |= TCA_ACT_FLAGS_REPLACE; + if (!rtnl_held) + flags |= TCA_ACT_FLAGS_NO_RTNL; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, - n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, - rtnl_held, extack); + flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held); @@ -2711,13 +2704,11 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); - if (err > 0) - err = 0; return err; } @@ -2741,7 +2732,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, } if (unicast) - return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + return rtnl_unicast(skb, net, portid); return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } @@ -2904,7 +2895,7 @@ replay: break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, - n->nlmsg_seq, n->nlmsg_type, true); + n->nlmsg_flags, n->nlmsg_type, true); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; @@ -3035,8 +3026,8 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr *rate_tlv, struct tcf_exts *exts, + u32 flags, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { @@ -3047,13 +3038,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + a_o = tc_action_load_ops(tb[exts->police], true, + !(flags & TCA_ACT_FLAGS_NO_RTNL), + extack); if (IS_ERR(a_o)) return PTR_ERR(a_o); + flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; act = tcf_action_init_1(net, tp, tb[exts->police], - rate_tlv, "police", ovr, - TCA_ACT_BIND, a_o, init_res, - rtnl_held, extack); + rate_tlv, a_o, init_res, flags, + extack); module_put(a_o->owner); if (IS_ERR(act)) return PTR_ERR(act); @@ -3065,10 +3058,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, } else if (exts->action && tb[exts->action]) { int err; + flags |= TCA_ACT_FLAGS_BIND; err = tcf_action_init(net, tp, tb[exts->action], - rate_tlv, NULL, ovr, TCA_ACT_BIND, - exts->actions, init_res, - &attr_size, rtnl_held, extack); + rate_tlv, exts->actions, init_res, + &attr_size, flags, extack); if (err < 0) return err; exts->nr_actions = err; @@ -3832,7 +3825,7 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru fl = rcu_dereference_bh(qe->filter_chain); - switch (tcf_classify(skb, fl, &cl_res, false)) { + switch (tcf_classify(skb, NULL, fl, &cl_res, false)) { case TC_ACT_SHOT: qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index f256a7c69093..8158fc9ee1ab 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -145,12 +145,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; @@ -169,8 +169,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, + u32 flags, struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -216,7 +216,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, goto errout; } - err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], flags, extack); if (err < 0) { if (!fold) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6e3e63db0e01..df19a847829e 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -85,8 +85,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; - /* Needed here for accessing maps. */ - rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; @@ -98,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); } if (prog->exts_integrated) { @@ -131,7 +129,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, break; } - rcu_read_unlock(); return ret; } @@ -407,7 +404,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, - struct nlattr **tb, struct nlattr *est, bool ovr, + struct nlattr **tb, struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { bool is_bpf, is_ebpf, have_exts = false; @@ -419,7 +416,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true, + ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, flags, extack); if (ret < 0) return ret; @@ -458,7 +455,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -503,7 +500,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; prog->handle = handle; - ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr, + ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], flags, extack); if (ret < 0) goto errout_idr; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index fb881144fa01..ed00001b528a 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -76,7 +76,7 @@ static void cls_cgroup_destroy_work(struct work_struct *work) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; @@ -108,8 +108,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, flags, + extack); if (err < 0) goto errout; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 87398af2715a..972303aa8edd 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -387,7 +387,7 @@ static void flow_destroy_filter_work(struct work_struct *work) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); @@ -442,8 +442,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err2; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags, + extack); if (err < 0) goto err2; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d7869a984881..23b21253b3c3 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1915,23 +1915,22 @@ errout_cleanup: static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, - struct fl_flow_tmplt *tmplt, bool rtnl_held, + struct nlattr *est, + struct fl_flow_tmplt *tmplt, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held, - extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); - if (!rtnl_held) + if (flags & TCA_ACT_FLAGS_NO_RTNL) rtnl_lock(); tcf_bind_filter(tp, &f->res, base); - if (!rtnl_held) + if (flags & TCA_ACT_FLAGS_NO_RTNL) rtnl_unlock(); } @@ -1975,10 +1974,11 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); + bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; struct fl_flow_mask *mask; @@ -2034,8 +2034,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } - err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr, - tp->chain->tmplt_priv, rtnl_held, extack); + err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], + tp->chain->tmplt_priv, flags, extack); if (err) goto errout; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index ec945294626a..8654b0ce997c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -198,15 +198,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, - struct nlattr **tca, unsigned long base, bool ovr, + struct nlattr **tca, unsigned long base, u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, flags, + extack); if (err < 0) return err; @@ -237,8 +237,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, - bool ovr, bool rtnl_held, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; @@ -277,7 +276,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return err; } - err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack); + err = fw_set_parms(net, tp, fnew, tb, tca, base, flags, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); @@ -326,7 +325,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, f->id = handle; f->tp = tp; - err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack); + err = fw_set_parms(net, tp, f, tb, tca, base, flags, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index cafb84480bab..24f0046ce0b3 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -163,13 +163,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { static int mall_set_parms(struct net *net, struct tcf_proto *tp, struct cls_mall_head *head, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, true, - extack); + err = tcf_exts_validate(net, tp, tb, est, &head->exts, flags, extack); if (err < 0) return err; @@ -183,13 +182,13 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; struct cls_mall_head *new; - u32 flags = 0; + u32 userflags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -204,8 +203,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return err; if (tb[TCA_MATCHALL_FLAGS]) { - flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); - if (!tc_flags_valid(flags)) + userflags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); + if (!tc_flags_valid(userflags)) return -EINVAL; } @@ -220,14 +219,14 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!handle) handle = 1; new->handle = handle; - new->flags = flags; + new->flags = userflags; new->pf = alloc_percpu(struct tc_matchall_pcnt); if (!new->pf) { err = -ENOMEM; goto err_alloc_percpu; } - err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, + err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], flags, extack); if (err) goto err_set_parms; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 5efa3e7ace15..a35ab8c27866 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -382,7 +382,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, - bool ovr, struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; @@ -390,7 +390,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; @@ -464,8 +464,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, u32 flags, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -510,7 +510,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, } err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], new, ovr, extack); + tca[TCA_RATE], new, flags, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 2e288f88ff02..5cd9d6b143c4 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -7,7 +7,7 @@ /* Comparing to general packet classification problem, - RSVP needs only sevaral relatively simple rules: + RSVP needs only several relatively simple rules: * (dst, protocol) are always specified, so that we are able to hash them. @@ -470,9 +470,8 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + u32 handle, struct nlattr **tca, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); @@ -499,7 +498,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true, + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags, extack); if (err < 0) goto errout2; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index c4007b9cd16d..742c7d49a958 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -278,6 +278,8 @@ static int tcindex_filter_result_init(struct tcindex_filter_result *r, TCA_TCINDEX_POLICE); } +static void tcindex_free_perfect_hash(struct tcindex_data *cp); + static void tcindex_partial_destroy_work(struct work_struct *work) { struct tcindex_data *p = container_of(to_rcu_work(work), @@ -285,7 +287,8 @@ static void tcindex_partial_destroy_work(struct work_struct *work) rwork); rtnl_lock(); - kfree(p->perfect); + if (p->perfect) + tcindex_free_perfect_hash(p); kfree(p); rtnl_unlock(); } @@ -304,7 +307,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) int i, err = 0; cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!cp->perfect) return -ENOMEM; @@ -327,7 +330,7 @@ static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_data *cp = NULL, *oldp; @@ -339,7 +342,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack); if (err < 0) goto errout; @@ -526,8 +529,8 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, u32 flags, + struct netlink_ext_ack *extack) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; @@ -548,7 +551,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, return err; return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr, extack); + tca[TCA_RATE], flags, extack); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 6e1abe805448..4272814487f0 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -709,12 +709,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { static int u32_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &n->exts, flags, extack); if (err < 0) return err; @@ -840,7 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, bool rtnl_held, + struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; @@ -849,7 +849,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; - u32 htid, flags = 0; + u32 htid, userflags = 0; size_t sel_size; int err; @@ -868,8 +868,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; if (tb[TCA_U32_FLAGS]) { - flags = nla_get_u32(tb[TCA_U32_FLAGS]); - if (!tc_flags_valid(flags)) { + userflags = nla_get_u32(tb[TCA_U32_FLAGS]); + if (!tc_flags_valid(userflags)) { NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; } @@ -884,7 +884,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - if ((n->flags ^ flags) & + if ((n->flags ^ userflags) & ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) { NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; @@ -895,7 +895,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -ENOMEM; err = u32_set_parms(net, tp, base, new, tb, - tca[TCA_RATE], ovr, extack); + tca[TCA_RATE], flags, extack); if (err) { u32_destroy_key(new, false); @@ -955,9 +955,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->handle = handle; ht->prio = tp->prio; idr_init(&ht->handle_idr); - ht->flags = flags; + ht->flags = userflags; - err = u32_replace_hw_hnode(tp, ht, flags, extack); + err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { idr_remove(&tp_c->handle_idr, handle); kfree(ht); @@ -1038,7 +1038,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; - n->flags = flags; + n->flags = userflags; err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) @@ -1060,7 +1060,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr, + err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], flags, extack); if (err == 0) { struct tc_u_knode __rcu **ins; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index f885bea5b452..4ce681361851 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -141,7 +141,7 @@ errout: EXPORT_SYMBOL(tcf_em_register); /** - * tcf_em_unregister - unregster and extended match + * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f87d07736a14..5e90e9b160e3 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1845,7 +1845,6 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - int err = 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) @@ -1856,11 +1855,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, return -EINVAL; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; - return err; + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tclass_del_notify(struct net *net, @@ -1894,8 +1890,6 @@ static int tclass_del_notify(struct net *net, err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; return err; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index d0c9a57398fc..7d8518176b45 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -394,7 +394,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tcf_classify(skb, fl, &res, true); + result = tcf_classify(skb, NULL, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 951542843cab..3c2300d14468 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -720,7 +720,7 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, skip_hash: if (flow_override) flow_hash = flow_override - 1; - else if (use_skbhash) + else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; @@ -1665,7 +1665,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index b79a7e27bb31..e0da15530f0e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -228,7 +228,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tcf_classify(skb, fl, &res, true); + result = tcf_classify(skb, NULL, fl, &res, true); if (!fl || result < 0) goto fallback; @@ -1614,7 +1614,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); - return err; + goto failure; } if (tca[TCA_RATE]) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index fc1e47069593..642cd179b7a7 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -317,7 +317,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index d320bcfb2da2..4c100d105269 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -242,7 +242,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tcf_classify(skb, fl, &res, false); + int result = tcf_classify(skb, NULL, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index c1e84d1eeaba..1f857ffd1ac2 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -390,7 +390,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -660,6 +660,13 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); q->nbands = nbands; + for (i = nstrict; i < q->nstrict; i++) { + INIT_LIST_HEAD(&q->classes[i].alist); + if (q->classes[i].qdisc->q.qlen) { + list_add_tail(&q->classes[i].alist, &q->active); + q->classes[i].deficit = quanta[i]; + } + } q->nstrict = nstrict; memcpy(q->prio2band, priomap, sizeof(priomap)); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index bbd5f8753600..bb0cd6d3d2c2 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -91,7 +91,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -369,6 +369,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + u32 quantum = 0; int err; if (!opt) @@ -386,6 +387,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->flows_cnt > 65536) return -EINVAL; } + if (tb[TCA_FQ_CODEL_QUANTUM]) { + quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum > FQ_CODEL_QUANTUM_MAX) { + NL_SET_ERR_MSG(extack, "Invalid quantum"); + return -EINVAL; + } + } sch_tree_lock(sch); if (tb[TCA_FQ_CODEL_TARGET]) { @@ -412,8 +420,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_ECN]) q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); - if (tb[TCA_FQ_CODEL_QUANTUM]) - q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum) + q->quantum = quantum; if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index cac684952edc..830f3559f727 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -94,7 +94,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_pie_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc8b56bcabf3..a8dd06c74e31 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -52,6 +52,8 @@ static void qdisc_maybe_clear_missed(struct Qdisc *q, */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); + else + set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ @@ -164,9 +166,13 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) skb = next; } - if (lock) + + if (lock) { spin_unlock(lock); - __netif_schedule(q); + set_bit(__QDISC_STATE_MISSED, &q->state); + } else { + __netif_schedule(q); + } } static void try_bulk_dequeue_skb(struct Qdisc *q, @@ -409,7 +415,11 @@ void __qdisc_run(struct Qdisc *q) while (qdisc_restart(q, &packets)) { quota -= packets; if (quota <= 0) { - __netif_schedule(q); + if (q->flags & TCQ_F_NOLOCK) + set_bit(__QDISC_STATE_MISSED, &q->state); + else + __netif_schedule(q); + break; } } @@ -540,6 +550,24 @@ void netif_carrier_off(struct net_device *dev) } EXPORT_SYMBOL(netif_carrier_off); +/** + * netif_carrier_event - report carrier state event + * @dev: network device + * + * Device has detected a carrier event but the carrier state wasn't changed. + * Use in drivers when querying carrier state asynchronously, to avoid missing + * events (link flaps) if link recovers before it's queried. + */ +void netif_carrier_event(struct net_device *dev) +{ + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + atomic_inc(&dev->carrier_up_count); + atomic_inc(&dev->carrier_down_count); + linkwatch_fire_event(dev); +} +EXPORT_SYMBOL_GPL(netif_carrier_event); + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. @@ -680,13 +708,14 @@ retry: if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && - test_bit(__QDISC_STATE_MISSED, &qdisc->state)) { + READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). */ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. @@ -696,8 +725,6 @@ retry: need_retry = false; goto retry; - } else { - WRITE_ONCE(qdisc->empty, true); } return skb; @@ -886,7 +913,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); - lockdep_set_class(&sch->busylock, + lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); seqcount_init(&sch->running); @@ -898,7 +925,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - sch->empty = true; dev_hold(dev); refcount_set(&sch->refcnt, 1); @@ -1204,6 +1230,7 @@ static void dev_reset_queue(struct net_device *dev, spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index f4132dc25ac0..621dc6afde8f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -6,7 +6,7 @@ * * 991129: - Bug fix with grio mode * - a better sing. AvgQ mode with Grio(WRED) - * - A finer grained VQ dequeue based on sugestion + * - A finer grained VQ dequeue based on suggestion * from Ren Liu * - More error checks * diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index bf0034c66e35..b7ac30cca035 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1130,7 +1130,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 8827987ba903..5067a6e5d4fd 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -52,7 +52,7 @@ */ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */ -#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ +#define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" @@ -125,6 +125,7 @@ struct htb_class { struct htb_class_leaf { int deficit[TC_HTB_MAXDEPTH]; struct Qdisc *q; + struct netdev_queue *offload_queue; } leaf; struct htb_class_inner { struct htb_prio clprio[TC_HTB_NUMPRIO]; @@ -238,7 +239,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -273,6 +274,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, /** * htb_add_to_id_tree - adds class to the round robin list + * @root: the root of the tree + * @cl: the class to add + * @prio: the give prio in class * * Routine adds class to the list (actually tree) sorted by classid. * Make sure that class is not already on such list for given prio. @@ -298,6 +302,9 @@ static void htb_add_to_id_tree(struct rb_root *root, /** * htb_add_to_wait_tree - adds class to the event queue with delay + * @q: the priority event queue + * @cl: the class to add + * @delay: delay in microseconds * * The class is added to priority event queue to indicate that class will * change its mode in cl->pq_key microseconds. Make sure that class is not @@ -331,6 +338,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q, /** * htb_next_rb_node - finds next node in binary tree + * @n: the current node in binary tree * * When we are past last key we return NULL. * Average complexity is 2 steps per call. @@ -342,6 +350,9 @@ static inline void htb_next_rb_node(struct rb_node **n) /** * htb_add_class_to_row - add class to its row + * @q: the priority event queue + * @cl: the class to add + * @mask: the given priorities in class in bitmap * * The class is added to row at priorities marked in mask. * It does nothing if mask == 0. @@ -371,6 +382,9 @@ static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) /** * htb_remove_class_from_row - removes class from its row + * @q: the priority event queue + * @cl: the class to add + * @mask: the given priorities in class in bitmap * * The class is removed from row at priorities marked in mask. * It does nothing if mask == 0. @@ -398,6 +412,8 @@ static inline void htb_remove_class_from_row(struct htb_sched *q, /** * htb_activate_prios - creates active classe's feed chain + * @q: the priority event queue + * @cl: the class to activate * * The class is connected to ancestors and/or appropriate rows * for priorities it is participating on. cl->cmode must be new @@ -433,6 +449,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) /** * htb_deactivate_prios - remove class from feed chain + * @q: the priority event queue + * @cl: the class to deactivate * * cl->cmode must represent old mode (before deactivation). It does * nothing if cl->prio_activity == 0. Class is removed from all feed @@ -493,6 +511,8 @@ static inline s64 htb_hiwater(const struct htb_class *cl) /** * htb_class_mode - computes and returns current class mode + * @cl: the target class + * @diff: diff time in microseconds * * It computes cl's mode at time cl->t_c+diff and returns it. If mode * is not HTB_CAN_SEND then cl->pq_key is updated to time difference @@ -521,9 +541,12 @@ htb_class_mode(struct htb_class *cl, s64 *diff) /** * htb_change_class_mode - changes classe's mode + * @q: the priority event queue + * @cl: the target class + * @diff: diff time in microseconds * * This should be the only way how to change classe's mode under normal - * cirsumstances. Routine will update feed lists linkage, change mode + * circumstances. Routine will update feed lists linkage, change mode * and add class to the wait event queue if appropriate. New mode should * be different from old one and cl->pq_key has to be valid if changing * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). @@ -553,6 +576,8 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) /** * htb_activate - inserts leaf cl into appropriate active feeds + * @q: the priority event queue + * @cl: the target class * * Routine learns (new) priority of leaf and activates feed chain * for the prio. It can be called on already active leaf safely. @@ -570,6 +595,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) /** * htb_deactivate - remove leaf cl from active feeds + * @q: the priority event queue + * @cl: the target class * * Make sure that leaf is active. In the other words it can't be called * with non-active leaf. It also removes class from the drop list. @@ -649,6 +676,10 @@ static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) /** * htb_charge_class - charges amount "bytes" to leaf and ancestors + * @q: the priority event queue + * @cl: the class to start iterate + * @level: the minimum level to account + * @skb: the socket buffer * * Routine assumes that packet "bytes" long was dequeued from leaf cl * borrowing from "level". It accounts bytes to ceil leaky bucket for @@ -698,6 +729,9 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, /** * htb_do_events - make mode changes to classes at the level + * @q: the priority event queue + * @level: which wait_pq in 'q->hlevel' + * @start: start jiffies * * Scans event queue for pending events and applies them. Returns time of * next pending event (0 for no event in pq, q->now for too many events). @@ -766,6 +800,8 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, /** * htb_lookup_leaf - returns next leaf class in DRR order + * @hprio: the current one + * @prio: which prio in class * * Find leaf where current feed pointers points to. */ @@ -1376,24 +1412,47 @@ htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q) return old_q; } -static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new) +static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl) +{ + struct netdev_queue *queue; + + queue = cl->leaf.offload_queue; + if (!(cl->leaf.q->flags & TCQ_F_BUILTIN)) + WARN_ON(cl->leaf.q->dev_queue != queue); + + return queue; +} + +static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old, + struct htb_class *cl_new, bool destroying) { struct netdev_queue *queue_old, *queue_new; struct net_device *dev = qdisc_dev(sch); - struct Qdisc *qdisc; - queue_old = netdev_get_tx_queue(dev, qid_old); - queue_new = netdev_get_tx_queue(dev, qid_new); + queue_old = htb_offload_get_queue(cl_old); + queue_new = htb_offload_get_queue(cl_new); - if (dev->flags & IFF_UP) - dev_deactivate(dev); - qdisc = dev_graft_qdisc(queue_old, NULL); - qdisc->dev_queue = queue_new; - qdisc = dev_graft_qdisc(queue_new, qdisc); - if (dev->flags & IFF_UP) - dev_activate(dev); + if (!destroying) { + struct Qdisc *qdisc; - WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN)); + if (dev->flags & IFF_UP) + dev_deactivate(dev); + qdisc = dev_graft_qdisc(queue_old, NULL); + WARN_ON(qdisc != cl_old->leaf.q); + } + + if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN)) + cl_old->leaf.q->dev_queue = queue_new; + cl_old->leaf.offload_queue = queue_new; + + if (!destroying) { + struct Qdisc *qdisc; + + qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q); + if (dev->flags & IFF_UP) + dev_activate(dev); + WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN)); + } } static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, @@ -1407,10 +1466,8 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (cl->level) return -EINVAL; - if (q->offload) { - dev_queue = new->dev_queue; - WARN_ON(dev_queue != cl->leaf.q->dev_queue); - } + if (q->offload) + dev_queue = htb_offload_get_queue(cl); if (!new) { new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, @@ -1479,6 +1536,8 @@ static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl, parent->ctokens = parent->cbuffer; parent->t_c = ktime_get_ns(); parent->cmode = HTB_CAN_SEND; + if (q->offload) + parent->leaf.offload_queue = cl->leaf.offload_queue; } static void htb_parent_to_leaf_offload(struct Qdisc *sch, @@ -1499,6 +1558,7 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, struct netlink_ext_ack *extack) { struct tc_htb_qopt_offload offload_opt; + struct netdev_queue *dev_queue; struct Qdisc *q = cl->leaf.q; struct Qdisc *old = NULL; int err; @@ -1507,16 +1567,15 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, return -EINVAL; WARN_ON(!q); - if (!destroying) { - /* On destroy of HTB, two cases are possible: - * 1. q is a normal qdisc, but q->dev_queue has noop qdisc. - * 2. q is a noop qdisc (for nodes that were inner), - * q->dev_queue is noop_netdev_queue. + dev_queue = htb_offload_get_queue(cl); + old = htb_graft_helper(dev_queue, NULL); + if (destroying) + /* Before HTB is destroyed, the kernel grafts noop_qdisc to + * all queues. */ - old = htb_graft_helper(q->dev_queue, NULL); - WARN_ON(!old); + WARN_ON(!(old->flags & TCQ_F_BUILTIN)); + else WARN_ON(old != q); - } if (cl->parent) { cl->parent->bstats_bias.bytes += q->bstats.bytes; @@ -1535,18 +1594,17 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, if (!err || destroying) qdisc_put(old); else - htb_graft_helper(q->dev_queue, old); + htb_graft_helper(dev_queue, old); if (last_child) return err; - if (!err && offload_opt.moved_qid != 0) { - if (destroying) - q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch), - offload_opt.qid); - else - htb_offload_move_qdisc(sch, offload_opt.moved_qid, - offload_opt.qid); + if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) { + u32 classid = TC_H_MAJ(sch->handle) | + TC_H_MIN(offload_opt.classid); + struct htb_class *moved_cl = htb_find(classid, sch); + + htb_offload_move_qdisc(sch, moved_cl, cl, destroying); } return err; @@ -1669,9 +1727,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, } if (last_child) { - struct netdev_queue *dev_queue; + struct netdev_queue *dev_queue = sch->dev_queue; + + if (q->offload) + dev_queue = htb_offload_get_queue(cl); - dev_queue = q->offload ? cl->leaf.q->dev_queue : sch->dev_queue; new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); @@ -1843,7 +1903,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } dev_queue = netdev_get_tx_queue(dev, offload_opt.qid); } else { /* First child. */ - dev_queue = parent->leaf.q->dev_queue; + dev_queue = htb_offload_get_queue(parent); old_q = htb_graft_helper(dev_queue, NULL); WARN_ON(old_q != parent->leaf.q); offload_opt = (struct tc_htb_qopt_offload) { @@ -1900,6 +1960,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* leaf (we) needs elementary qdisc */ cl->leaf.q = new_q ? new_q : &noop_qdisc; + if (q->offload) + cl->leaf.offload_queue = dev_queue; cl->parent = parent; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 5c27b4270b90..e282e7382117 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -36,7 +36,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 3eabb871a1d5..03fdf31ccb6a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -39,7 +39,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 1db9d4a2ef5e..58a9d42b52b8 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -485,11 +485,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl->qdisc != &noop_qdisc) qdisc_hash_add(cl->qdisc, true); - sch_tree_lock(sch); - qdisc_class_hash_insert(&q->clhash, &cl->common); - sch_tree_unlock(sch); - - qdisc_class_hash_grow(sch, &q->clhash); set_change_agg: sch_tree_lock(sch); @@ -507,8 +502,11 @@ set_change_agg: } if (existing) qfq_deact_rm_from_agg(q, cl); + else + qdisc_class_hash_insert(&q->clhash, &cl->common); qfq_add_to_agg(q, new_agg, cl); sch_tree_unlock(sch); + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; @@ -692,7 +690,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index dde829d4b9f8..3d061a13d7ed 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -257,7 +257,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 066754a18569..f8e569f79f13 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -178,7 +178,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 5c91df52b8c2..1ab2fc933a21 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -114,9 +114,6 @@ static void taprio_free_sched_cb(struct rcu_head *head) struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; - if (!sched) - return; - list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); @@ -438,6 +435,11 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child; int queue; + if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { + WARN_ONCE(1, "Trying to enqueue skb into the root of a taprio qdisc configured with full offload\n"); + return qdisc_drop(skb, sch, to_free); + } + queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; @@ -529,23 +531,7 @@ static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - struct sk_buff *skb; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - - if (unlikely(!child)) - continue; - - skb = child->ops->peek(child); - if (!skb) - continue; - - return skb; - } + WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); return NULL; } @@ -578,7 +564,7 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 - * "AdminGateSates" + * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; @@ -654,27 +640,7 @@ done: static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) { - struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); - struct sk_buff *skb; - int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - - if (unlikely(!child)) - continue; - - skb = child->ops->dequeue(child); - if (unlikely(!skb)) - continue; - - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - return skb; - } + WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); return NULL; } @@ -1547,7 +1513,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, taprio_set_picos_per_byte(dev, q); if (mqprio) { - netdev_set_num_tc(dev, mqprio->num_tc); + err = netdev_set_num_tc(dev, mqprio->num_tc); + if (err) + goto free_sched; for (i = 0; i < mqprio->num_tc; i++) netdev_set_tc_queue(dev, i, mqprio->count[i], @@ -1759,6 +1727,35 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return taprio_change(sch, opt, extack); } +static void taprio_attach(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct Qdisc *qdisc = q->qdiscs[ntx]; + struct Qdisc *old; + + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + } else { + old = dev_graft_qdisc(qdisc->dev_queue, sch); + qdisc_refcount_inc(sch); + } + if (old) + qdisc_put(old); + } + + /* access to the child qdiscs is not needed in offload mode */ + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + kfree(q->qdiscs); + q->qdiscs = NULL; + } +} + static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { @@ -1785,8 +1782,12 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (dev->flags & IFF_UP) dev_deactivate(dev); - *old = q->qdiscs[cl - 1]; - q->qdiscs[cl - 1] = new; + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + *old = dev_graft_qdisc(dev_queue, new); + } else { + *old = q->qdiscs[cl - 1]; + q->qdiscs[cl - 1] = new; + } if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; @@ -2020,6 +2021,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, + .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 336df4b36655..be29da09cc7a 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -98,6 +98,7 @@ static struct sctp_association *sctp_association_init( * sock configured value. */ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + asoc->probe_interval = msecs_to_jiffies(sp->probe_interval); asoc->encap_port = sp->encap_port; @@ -625,6 +626,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * association configured value. */ peer->hbinterval = asoc->hbinterval; + peer->probe_interval = asoc->probe_interval; peer->encap_port = asoc->encap_port; @@ -714,6 +716,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return NULL; } + sctp_transport_pl_reset(peer); + /* Attach the remote transport to our asoc. */ list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; @@ -812,6 +816,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, spc_state = SCTP_ADDR_CONFIRMED; transport->state = SCTP_ACTIVE; + sctp_transport_pl_reset(transport); break; case SCTP_TRANSPORT_DOWN: @@ -821,6 +826,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, */ if (transport->state != SCTP_UNCONFIRMED) { transport->state = SCTP_INACTIVE; + sctp_transport_pl_reset(transport); spc_state = SCTP_ADDR_UNREACHABLE; } else { sctp_transport_dst_release(transport); diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 6f8319b828b0..db6b7373d16c 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -857,12 +857,18 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength); cur_key->key = key; - if (replace) { - list_del_init(&shkey->key_list); - sctp_auth_shkey_release(shkey); + if (!replace) { + list_add(&cur_key->key_list, sh_keys); + return 0; } + + list_del_init(&shkey->key_list); + sctp_auth_shkey_release(shkey); list_add(&cur_key->key_list, sh_keys); + if (asoc && asoc->active_key_id == auth_key->sca_keynumber) + sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL); + return 0; } diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 53e5ed79f63f..59e653b528b1 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -270,22 +270,19 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); - if (unlikely(!af)) { + if (unlikely(!af) || + !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; - sctp_bind_addr_clean(bp); - break; + goto out_err; } - af->from_addr_param(&addr, rawaddr, htons(port), 0); if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); - if (retval) { + if (retval) /* Can't finish building the list, clean up. */ - sctp_bind_addr_clean(bp); - break; - } + goto out_err; next: len = ntohs(param->length); @@ -294,6 +291,12 @@ next: } return retval; + +out_err: + if (retval) + sctp_bind_addr_clean(bp); + + return retval; } /******************************************************************** diff --git a/net/sctp/debug.c b/net/sctp/debug.c index c4d9c7feffb9..ccd773e4c371 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -154,6 +154,7 @@ static const char *const sctp_timer_tbl[] = { "TIMEOUT_T5_SHUTDOWN_GUARD", "TIMEOUT_HEARTBEAT", "TIMEOUT_RECONF", + "TIMEOUT_PROBE", "TIMEOUT_SACK", "TIMEOUT_AUTOCLOSE", }; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 493fc01e5d2b..760b367644c1 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -284,10 +284,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) goto out; } - err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + out: return err; } diff --git a/net/sctp/input.c b/net/sctp/input.c index d508f6f3dd08..5ef86fdb1176 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -385,7 +385,9 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (!t || (t->pathmtu <= pmtu)) + if (!t || + (t->pathmtu <= pmtu && + t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { @@ -554,6 +556,50 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) sctp_transport_put(t); } +static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, + __u8 type, __u8 code, __u32 info) +{ + struct sctp_association *asoc = t->asoc; + struct sock *sk = asoc->base.sk; + int err = 0; + + switch (type) { + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + return; + if (code == ICMP_FRAG_NEEDED) { + sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); + return; + } + if (code == ICMP_PROT_UNREACH) { + sctp_icmp_proto_unreachable(sk, asoc, t); + return; + } + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + if (code == ICMP_EXC_FRAGTIME) + return; + + err = EHOSTUNREACH; + break; + case ICMP_REDIRECT: + sctp_icmp_redirect(sk, t, skb); + return; + default: + return; + } + if (!sock_owned_by_user(sk) && inet_sk(sk)->recverr) { + sk->sk_err = err; + sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -572,22 +618,19 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; - const int ihlen = iph->ihl * 4; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; - struct sctp_association *asoc = NULL; + struct net *net = dev_net(skb->dev); struct sctp_transport *transport; - struct inet_sock *inet; + struct sctp_association *asoc; __u16 saveip, savesctp; - int err; - struct net *net = dev_net(skb->dev); + struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); - skb_set_transport_header(skb, ihlen); + skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; @@ -596,59 +639,41 @@ int sctp_v4_err(struct sk_buff *skb, __u32 info) __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } - /* Warning: The sock lock is held. Remember to call - * sctp_err_finish! - */ - switch (type) { - case ICMP_PARAMETERPROB: - err = EPROTO; - break; - case ICMP_DEST_UNREACH: - if (code > NR_ICMP_UNREACH) - goto out_unlock; + sctp_v4_err_handle(transport, skb, type, code, info); + sctp_err_finish(sk, transport); - /* PMTU discovery (RFC1191) */ - if (ICMP_FRAG_NEEDED == code) { - sctp_icmp_frag_needed(sk, asoc, transport, - SCTP_TRUNC4(info)); - goto out_unlock; - } else { - if (ICMP_PROT_UNREACH == code) { - sctp_icmp_proto_unreachable(sk, asoc, - transport); - goto out_unlock; - } - } - err = icmp_err_convert[code].errno; - break; - case ICMP_TIME_EXCEEDED: - /* Ignore any time exceeded errors due to fragment reassembly - * timeouts. - */ - if (ICMP_EXC_FRAGTIME == code) - goto out_unlock; + return 0; +} - err = EHOSTUNREACH; - break; - case ICMP_REDIRECT: - sctp_icmp_redirect(sk, transport, skb); - /* Fall through to out_unlock. */ - default: - goto out_unlock; +int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct sctp_association *asoc; + struct sctp_transport *t; + struct icmphdr *hdr; + __u32 info = 0; + + skb->transport_header += sizeof(struct udphdr); + sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); + if (!sk) { + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return -ENOENT; } - inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { - sk->sk_err = err; - sk->sk_error_report(sk); - } else { /* Only an error on timeout */ - sk->sk_err_soft = err; + skb->transport_header -= sizeof(struct udphdr); + hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); + if (hdr->type == ICMP_REDIRECT) { + /* can't be handled without outer iphdr known, leave it to udp_err */ + sctp_err_finish(sk, t); + return 0; } + if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) + info = ntohs(hdr->un.frag.mtu); + sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); -out_unlock: - sctp_err_finish(sk, transport); - return 0; + sctp_err_finish(sk, t); + return 1; } /* @@ -1131,7 +1156,8 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, if (!af) continue; - af->from_addr_param(paddr, params.addr, sh->source, 0); + if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) + continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp); if (asoc) @@ -1167,6 +1193,9 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( union sctp_addr_param *param; union sctp_addr paddr; + if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) + return NULL; + /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); @@ -1174,7 +1203,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (unlikely(!af)) return NULL; - af->from_addr_param(&paddr, param, peer_port, 0); + if (!af->from_addr_param(&paddr, param, peer_port, 0)) + return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp); } @@ -1236,6 +1266,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, net, ch, laddr, sctp_hdr(skb)->source, transportp); + break; default: break; } @@ -1245,7 +1276,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; - } while (ch_end < skb_tail_pointer(skb)); + } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index bd08807c9e44..470dbdc27d58 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -100,8 +100,9 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET6 && - ipv6_addr_equal(&addr->a.v6.sin6_addr, - &ifa->addr)) { + ipv6_addr_equal(&addr->a.v6.sin6_addr, + &ifa->addr) && + addr->a.v6.sin6_scope_id == ifa->idev->dev->ifindex) { sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; @@ -122,54 +123,28 @@ static struct notifier_block sctp_inet6addr_notifier = { .notifier_call = sctp_inet6addr_event, }; -/* ICMP error handler. */ -static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) +static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, + __u8 type, __u8 code, __u32 info) { - struct inet6_dev *idev; - struct sock *sk; - struct sctp_association *asoc; - struct sctp_transport *transport; + struct sctp_association *asoc = t->asoc; + struct sock *sk = asoc->base.sk; struct ipv6_pinfo *np; - __u16 saveip, savesctp; - int err, ret = 0; - struct net *net = dev_net(skb->dev); - - idev = in6_dev_get(skb->dev); - - /* Fix up skb to look at the embedded net header. */ - saveip = skb->network_header; - savesctp = skb->transport_header; - skb_reset_network_header(skb); - skb_set_transport_header(skb, offset); - sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); - /* Put back, the original pointers. */ - skb->network_header = saveip; - skb->transport_header = savesctp; - if (!sk) { - __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); - ret = -ENOENT; - goto out; - } - - /* Warning: The sock lock is held. Remember to call - * sctp_err_finish! - */ + int err = 0; switch (type) { case ICMPV6_PKT_TOOBIG: if (ip6_sk_accept_pmtu(sk)) - sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); - goto out_unlock; + sctp_icmp_frag_needed(sk, asoc, t, info); + return; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { - sctp_icmp_proto_unreachable(sk, asoc, transport); - goto out_unlock; + sctp_icmp_proto_unreachable(sk, asoc, t); + return; } break; case NDISC_REDIRECT: - sctp_icmp_redirect(sk, transport, skb); - goto out_unlock; + sctp_icmp_redirect(sk, t, skb); + return; default: break; } @@ -178,18 +153,70 @@ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, icmpv6_err_convert(type, code, &err); if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); - } else { /* Only an error on timeout */ + sk_error_report(sk); + } else { sk->sk_err_soft = err; } +} + +/* ICMP error handler. */ +static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + struct sctp_transport *transport; + struct sctp_association *asoc; + __u16 saveip, savesctp; + struct sock *sk; + + /* Fix up skb to look at the embedded net header. */ + saveip = skb->network_header; + savesctp = skb->transport_header; + skb_reset_network_header(skb); + skb_set_transport_header(skb, offset); + sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); + /* Put back, the original pointers. */ + skb->network_header = saveip; + skb->transport_header = savesctp; + if (!sk) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return -ENOENT; + } -out_unlock: + sctp_v6_err_handle(transport, skb, type, code, ntohl(info)); sctp_err_finish(sk, transport); -out: - if (likely(idev != NULL)) - in6_dev_put(idev); - return ret; + return 0; +} + +int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct sctp_association *asoc; + struct sctp_transport *t; + struct icmp6hdr *hdr; + __u32 info = 0; + + skb->transport_header += sizeof(struct udphdr); + sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &t); + if (!sk) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return -ENOENT; + } + + skb->transport_header -= sizeof(struct udphdr); + hdr = (struct icmp6hdr *)(skb_network_header(skb) - sizeof(struct icmp6hdr)); + if (hdr->icmp6_type == NDISC_REDIRECT) { + /* can't be handled without outer ip6hdr known, leave it to udpv6_err */ + sctp_err_finish(sk, t); + return 0; + } + if (hdr->icmp6_type == ICMPV6_PKT_TOOBIG) + info = ntohl(hdr->icmp6_mtu); + sctp_v6_err_handle(t, skb, hdr->icmp6_type, hdr->icmp6_code, info); + + sctp_err_finish(sk, t); + return 1; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) @@ -551,15 +578,20 @@ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v6_from_addr_param(union sctp_addr *addr, +static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) + return false; + addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; + + return true; } /* Initialize an address parameter from a sctp_addr and return the length diff --git a/net/sctp/output.c b/net/sctp/output.c index a6aa17df09ef..4dfb5ea82b05 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -103,8 +103,9 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); - } else if (!sctp_transport_pmtu_check(tp)) { - if (asoc->param_flags & SPP_PMTUD_ENABLE) + } else if (!sctp_transport_pl_enabled(tp) && + asoc->param_flags & SPP_PMTUD_ENABLE) { + if (!sctp_transport_pmtu_check(tp)) sctp_assoc_sync_pmtu(asoc); } @@ -211,6 +212,30 @@ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, return retval; } +/* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ +static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) +{ + struct sctp_transport *t = pkt->transport; + struct sctp_chunk *pad; + int overhead = 0; + + if (!chunk->pmtu_probe) + return SCTP_XMIT_OK; + + /* calculate the Padding Data size for the pad chunk */ + overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); + pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); + if (!pad) + return SCTP_XMIT_DELAY; + + list_add_tail(&pad->list, &pkt->chunk_list); + pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); + chunk->transport = t; + + return SCTP_XMIT_OK; +} + /* Try to bundle an auth chunk into the packet. */ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) @@ -382,6 +407,10 @@ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; retval = __sctp_packet_append_chunk(packet, chunk); + if (retval != SCTP_XMIT_OK) + goto finish; + + retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; @@ -553,7 +582,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sk = chunk->skb->sk; /* check gso */ - if (packet->size > tp->pathmtu && !packet->ipfragok) { + if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (!sk_can_gso(sk)) { pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 5cb1aa5f067b..ff47091c385e 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -769,7 +769,11 @@ static int sctp_packet_singleton(struct sctp_transport *transport, sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); - sctp_packet_append_chunk(&singleton, chunk); + if (sctp_packet_append_chunk(&singleton, chunk) != SCTP_XMIT_OK) { + list_del_init(&chunk->list); + sctp_chunk_free(chunk); + return -ENOMEM; + } return sctp_packet_transmit(&singleton, gfp); } @@ -929,8 +933,13 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) one_packet = 1; fallthrough; - case SCTP_CID_SACK: case SCTP_CID_HEARTBEAT: + if (chunk->pmtu_probe) { + sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); + break; + } + fallthrough; + case SCTP_CID_SACK: case SCTP_CID_SHUTDOWN: case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6f2bbfeec3a4..ec0f52567c16 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -254,14 +254,19 @@ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v4_from_addr_param(union sctp_addr *addr, +static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) + return false; + addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); + + return true; } /* Initialize an address parameter from a sctp_addr and return the length @@ -393,7 +398,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) retval = SCTP_SCOPE_LINK; } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || ipv4_is_private_172(addr->v4.sin_addr.s_addr) || - ipv4_is_private_192(addr->v4.sin_addr.s_addr)) { + ipv4_is_private_192(addr->v4.sin_addr.s_addr) || + ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL; @@ -850,23 +856,6 @@ static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb) return 0; } -static int sctp_udp_err_lookup(struct sock *sk, struct sk_buff *skb) -{ - struct sctp_association *asoc; - struct sctp_transport *t; - int family; - - skb->transport_header += sizeof(struct udphdr); - family = (ip_hdr(skb)->version == 4) ? AF_INET : AF_INET6; - sk = sctp_err_lookup(dev_net(skb->dev), family, skb, sctp_hdr(skb), - &asoc, &t); - if (!sk) - return -ENOENT; - - sctp_err_finish(sk, t); - return 0; -} - int sctp_udp_sock_start(struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; @@ -885,7 +874,7 @@ int sctp_udp_sock_start(struct net *net) tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; - tuncfg.encap_err_lookup = sctp_udp_err_lookup; + tuncfg.encap_err_lookup = sctp_udp_v4_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp4_sock = sock->sk; @@ -907,7 +896,7 @@ int sctp_udp_sock_start(struct net *net) tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; - tuncfg.encap_err_lookup = sctp_udp_err_lookup; + tuncfg.encap_err_lookup = sctp_udp_v6_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp6_sock = sock->sk; #endif @@ -1171,7 +1160,6 @@ static const struct net_protocol sctp_protocol = { .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5b44d228b6ca..b8fa8f1a7277 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1160,9 +1160,10 @@ nodata: /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, - const struct sctp_transport *transport) + const struct sctp_transport *transport, + __u32 probe_size) { - struct sctp_sender_hb_info hbinfo; + struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, @@ -1176,6 +1177,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; + hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. @@ -1183,6 +1185,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); + retval->pmtu_probe = !!probe_size; nodata: return retval; @@ -1218,6 +1221,32 @@ nodata: return retval; } +/* RFC4820 3. Padding Chunk (PAD) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0x84 | Flags=0 | Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * \ Padding Data / + * / \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) +{ + struct sctp_chunk *retval; + + retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); + if (!retval) + return NULL; + + skb_put_zero(retval->skb, len); + retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); + retval->chunk_end = skb_tail_pointer(retval->skb); + + return retval; +} + /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ @@ -2166,9 +2195,16 @@ static enum sctp_ierror sctp_verify_param(struct net *net, break; case SCTP_PARAM_SET_PRIMARY: - if (ep->asconf_enable) - break; - goto unhandled; + if (!ep->asconf_enable) + goto unhandled; + + if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + + sizeof(struct sctp_paramhdr)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ @@ -2346,11 +2382,13 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, /* Process the initialization parameters. */ sctp_walk_params(param, peer_init, init_hdr.params) { - if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || - param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { + if (!src_match && + (param.p->type == SCTP_PARAM_IPV4_ADDRESS || + param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, - chunk->sctp_hdr->source, 0); + if (!af->from_addr_param(&addr, param.addr, + chunk->sctp_hdr->source, 0)) + continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } @@ -2531,7 +2569,8 @@ static int sctp_process_param(struct sctp_association *asoc, break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) + break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) @@ -2632,15 +2671,13 @@ do_addr_param: addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - if (af == NULL) + if (!af) break; - af->from_addr_param(&addr, addr_param, - htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, + htons(asoc->peer.port), 0)) + break; - /* if the address is invalid, we can't process it. - * XXX: see spec for what to do. - */ if (!af->addr_valid(&addr, NULL, NULL)) break; @@ -3054,7 +3091,8 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; - af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) + return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. @@ -3331,7 +3369,8 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - af->from_addr_param(&addr, addr_param, htons(bp->port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) + return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index ce15d590a615..b3815b568e8e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -471,6 +471,38 @@ out_unlock: sctp_transport_put(transport); } +/* Handle the timeout of the probe timer. */ +void sctp_generate_probe_event(struct timer_list *t) +{ + struct sctp_transport *transport = from_timer(transport, t, probe_timer); + struct sctp_association *asoc = transport->asoc; + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + int error = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + pr_debug("%s: sock is busy\n", __func__); + + /* Try again later. */ + if (!mod_timer(&transport->probe_timer, jiffies + (HZ / 20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_PROBE), + asoc->state, asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + sk->sk_err = -error; + +out_unlock: + bh_unlock_sock(sk); + sctp_transport_put(transport); +} + /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(struct timer_list *t) { @@ -1641,6 +1673,11 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, sctp_cmd_hb_timers_stop(commands, asoc); break; + case SCTP_CMD_PROBE_TIMER_UPDATE: + t = cmd->obj.transport; + sctp_transport_reset_probe_timer(t); + break; + case SCTP_CMD_REPORT_ERROR: error = cmd->obj.error; break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fd1e319eda00..32df65f68c12 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -361,7 +361,7 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, /* If the INIT is coming toward a closing socket, we'll send back * and ABORT. Essentially, this catches the race of INIT being - * backloged to the socket at the same time as the user isses close(). + * backloged to the socket at the same time as the user issues close(). * Since the socket and all its associations are going away, we * can treat this OOTB */ @@ -608,8 +608,8 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_COOKIE_ECHOED)); - /* SCTP-AUTH: genereate the assocition shared keys so that - * we can potentially signe the COOKIE-ECHO. + /* SCTP-AUTH: generate the association shared keys so that + * we can potentially sign the COOKIE-ECHO. */ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL()); @@ -787,7 +787,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, goto nomem_init; /* SCTP-AUTH: Now that we've populate required fields in - * sctp_process_init, set up the assocaition shared keys as + * sctp_process_init, set up the association shared keys as * necessary so that we can potentially authenticate the ACK */ error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC); @@ -838,7 +838,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, /* Add all the state machine commands now since we've created * everything. This way we don't introduce memory corruptions - * during side-effect processing and correclty count established + * during side-effect processing and correctly count established * associations. */ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); @@ -923,7 +923,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, commands); /* Reset init error count upon receipt of COOKIE-ACK, - * to avoid problems with the managemement of this + * to avoid problems with the management of this * counter in stale cookie situations when a transition back * from the COOKIE-ECHOED state to the COOKIE-WAIT * state is performed. @@ -1004,7 +1004,7 @@ static enum sctp_disposition sctp_sf_heartbeat( struct sctp_chunk *reply; /* Send a heartbeat to our peer. */ - reply = sctp_make_heartbeat(asoc, transport); + reply = sctp_make_heartbeat(asoc, transport, 0); if (!reply) return SCTP_DISPOSITION_NOMEM; @@ -1095,6 +1095,32 @@ enum sctp_disposition sctp_sf_send_reconf(struct net *net, return SCTP_DISPOSITION_CONSUME; } +/* send hb chunk with padding for PLPMUTD. */ +enum sctp_disposition sctp_sf_send_probe(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) +{ + struct sctp_transport *transport = (struct sctp_transport *)arg; + struct sctp_chunk *reply; + + if (!sctp_transport_pl_enabled(transport)) + return SCTP_DISPOSITION_CONSUME; + + if (sctp_transport_pl_send(transport)) { + reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size); + if (!reply) + return SCTP_DISPOSITION_NOMEM; + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + } + sctp_add_cmd_sf(commands, SCTP_CMD_PROBE_TIMER_UPDATE, + SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + /* * Process an heartbeat request. * @@ -1243,6 +1269,17 @@ enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net, if (hbinfo->hb_nonce != link->hb_nonce) return SCTP_DISPOSITION_DISCARD; + if (hbinfo->probe_size) { + if (hbinfo->probe_size != link->pl.probe_size || + !sctp_transport_pl_enabled(link)) + return SCTP_DISPOSITION_DISCARD; + + if (sctp_transport_pl_recv(link)) + return SCTP_DISPOSITION_CONSUME; + + return sctp_sf_send_probe(net, ep, asoc, type, link, commands); + } + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ @@ -2950,7 +2987,7 @@ enum sctp_disposition sctp_sf_do_9_2_reshutack( commands); /* Since we are not going to really process this INIT, there - * is no point in verifying chunk boundries. Just generate + * is no point in verifying chunk boundaries. Just generate * the SHUTDOWN ACK. */ reply = sctp_make_shutdown_ack(asoc, chunk); @@ -3560,7 +3597,7 @@ enum sctp_disposition sctp_sf_do_9_2_final(struct net *net, goto nomem_chunk; /* Do all the commands now (after allocation), so that we - * have consistent state if memory allocation failes + * have consistent state if memory allocation fails */ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); @@ -3747,7 +3784,7 @@ static enum sctp_disposition sctp_sf_shut_8_4_5( return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* We need to discard the rest of the packet to prevent - * potential bomming attacks from additional bundled chunks. + * potential boomming attacks from additional bundled chunks. * This is documented in SCTP Threats ID. */ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -4257,7 +4294,7 @@ gen_shutdown: } /* - * SCTP-AUTH Section 6.3 Receiving authenticated chukns + * SCTP-AUTH Section 6.3 Receiving authenticated chunks * * The receiver MUST use the HMAC algorithm indicated in the HMAC * Identifier field. If this algorithm was not specified by the @@ -4812,7 +4849,7 @@ static enum sctp_disposition sctp_sf_violation_ctsn( /* Handle protocol violation of an invalid chunk bundling. For example, * when we have an association and we receive bundled INIT-ACK, or - * SHUDOWN-COMPLETE, our peer is clearly violationg the "MUST NOT bundle" + * SHUTDOWN-COMPLETE, our peer is clearly violating the "MUST NOT bundle" * statement from the specs. Additionally, there might be an attacker * on the path and we may not want to continue this communication. */ @@ -5208,7 +5245,7 @@ enum sctp_disposition sctp_sf_cookie_wait_prm_shutdown( * Inputs * (endpoint, asoc) * - * The RFC does not explcitly address this issue, but is the route through the + * The RFC does not explicitly address this issue, but is the route through the * state table when someone issues a shutdown while in COOKIE_ECHOED state. * * Outputs @@ -5932,7 +5969,7 @@ enum sctp_disposition sctp_sf_t1_cookie_timer_expire( /* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN * with the updated last sequential TSN received from its peer. * - * An endpoint should limit the number of retransmissions of the + * An endpoint should limit the number of retransmission of the * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'. * If this threshold is exceeded the endpoint should destroy the TCB and * MUST report the peer endpoint unreachable to the upper layer (and @@ -6010,7 +6047,7 @@ nomem: } /* - * ADDIP Section 4.1 ASCONF CHunk Procedures + * ADDIP Section 4.1 ASCONF Chunk Procedures * If the T4 RTO timer expires the endpoint should do B1 to B5 */ enum sctp_disposition sctp_sf_t4_timer_expire( @@ -6441,7 +6478,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, chunk->ecn_ce_done = 1; if (af->is_ce(sctp_gso_headskb(chunk->skb))) { - /* Do real work as sideffect. */ + /* Do real work as side effect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); } diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 88ea87f4f0e7..1816a4410b2b 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -527,6 +527,26 @@ auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { }; /*state_fn_t auth_chunk_event_table[][] */ static const struct sctp_sm_table_entry +pad_chunk_event_table[SCTP_STATE_NUM_STATES] = { + /* SCTP_STATE_CLOSED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_COOKIE_WAIT */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_COOKIE_ECHOED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_ESTABLISHED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_PENDING */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_SENT */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_RECEIVED */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), +}; /* chunk pad */ + +static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_ootb), @@ -947,6 +967,25 @@ other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } +#define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_send_probe), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, @@ -958,6 +997,7 @@ timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, TYPE_SCTP_EVENT_TIMEOUT_RECONF, + TYPE_SCTP_EVENT_TIMEOUT_PROBE, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; @@ -992,6 +1032,9 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( case SCTP_CID_AUTH: return &auth_chunk_event_table[0][state]; + + case SCTP_CID_PAD: + return &pad_chunk_event_table[state]; } return &chunk_event_table_unknown[state]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a79d193ff872..6b937bfd4751 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2496,6 +2496,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, sctp_transport_pmtu(trans, sctp_opt2sk(sp)); sctp_assoc_sync_pmtu(asoc); } + sctp_transport_pl_reset(trans); } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; @@ -4481,6 +4482,61 @@ static int sctp_setsockopt_encap_port(struct sock *sk, return 0; } +static int sctp_setsockopt_probe_interval(struct sock *sk, + struct sctp_probeinterval *params, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_transport *t; + __u32 probe_interval; + + if (optlen != sizeof(*params)) + return -EINVAL; + + probe_interval = params->spi_interval; + if (probe_interval && probe_interval < SCTP_PROBE_TIMER_MIN) + return -EINVAL; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, (union sctp_addr *)¶ms->spi_address)) { + t = sctp_addr_id2transport(sk, ¶ms->spi_address, + params->spi_assoc_id); + if (!t) + return -EINVAL; + + t->probe_interval = msecs_to_jiffies(probe_interval); + sctp_transport_pl_reset(t); + return 0; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, params->spi_assoc_id); + if (!asoc && params->spi_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) + return -EINVAL; + + /* If changes are for association, also apply probe_interval to + * each transport. + */ + if (asoc) { + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { + t->probe_interval = msecs_to_jiffies(probe_interval); + sctp_transport_pl_reset(t); + } + + asoc->probe_interval = msecs_to_jiffies(probe_interval); + return 0; + } + + sctp_sk(sk)->probe_interval = probe_interval; + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4521,6 +4577,10 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, } if (optlen > 0) { + /* Trim it to the biggest size sctp sockopt may need if necessary */ + optlen = min_t(unsigned int, optlen, + PAGE_ALIGN(USHRT_MAX + + sizeof(__u16) * sizeof(struct sctp_reset_streams))); kopt = memdup_sockptr(optval, optlen); if (IS_ERR(kopt)) return PTR_ERR(kopt); @@ -4703,6 +4763,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_setsockopt_encap_port(sk, kopt, optlen); break; + case SCTP_PLPMTUD_PROBE_INTERVAL: + retval = sctp_setsockopt_probe_interval(sk, kopt, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4989,6 +5052,7 @@ static int sctp_init_sock(struct sock *sk) atomic_set(&sp->pd_mode, 0); skb_queue_head_init(&sp->pd_lobby); sp->frag_interleave = 0; + sp->probe_interval = net->sctp.probe_interval; /* Create a per socket endpoint structure. Even if we * change the data structure relationships, this may still @@ -7905,6 +7969,66 @@ out: return 0; } +static int sctp_getsockopt_probe_interval(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_probeinterval params; + struct sctp_association *asoc; + struct sctp_transport *t; + __u32 probe_interval; + + if (len < sizeof(params)) + return -EINVAL; + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, (union sctp_addr *)¶ms.spi_address)) { + t = sctp_addr_id2transport(sk, ¶ms.spi_address, + params.spi_assoc_id); + if (!t) { + pr_debug("%s: failed no transport\n", __func__); + return -EINVAL; + } + + probe_interval = jiffies_to_msecs(t->probe_interval); + goto out; + } + + /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the + * socket is a one to many style socket, and an association + * was not found, then the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spi_assoc_id); + if (!asoc && params.spi_assoc_id != SCTP_FUTURE_ASSOC && + sctp_style(sk, UDP)) { + pr_debug("%s: failed no association\n", __func__); + return -EINVAL; + } + + if (asoc) { + probe_interval = jiffies_to_msecs(asoc->probe_interval); + goto out; + } + + probe_interval = sctp_sk(sk)->probe_interval; + +out: + params.spi_interval = probe_interval; + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -8128,6 +8252,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_getsockopt_encap_port(sk, len, optval, optlen); break; + case SCTP_PLPMTUD_PROBE_INTERVAL: + retval = sctp_getsockopt_probe_interval(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 55871b277f47..b46a416787ec 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -55,6 +55,8 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_probe_interval(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -294,6 +296,13 @@ static struct ctl_table sctp_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "plpmtud_probe_interval", + .data = &init_net.sctp.probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_sctp_do_probe_interval, + }, + { .procname = "udp_port", .data = &init_net.sctp.udp_port, .maxlen = sizeof(int), @@ -539,6 +548,32 @@ static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, return ret; } +static int proc_sctp_do_probe_interval(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + struct ctl_table tbl; + int ret, new_value; + + memset(&tbl, 0, sizeof(struct ctl_table)); + tbl.maxlen = sizeof(unsigned int); + + if (write) + tbl.data = &new_value; + else + tbl.data = &net->sctp.probe_interval; + + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + if (new_value && new_value < SCTP_PROBE_TIMER_MIN) + return -EINVAL; + + net->sctp.probe_interval = new_value; + } + + return ret; +} + int sctp_sysctl_net_register(struct net *net) { struct ctl_table *table; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index bf0ac467e757..a3d3ca6dd63d 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -75,6 +75,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net, timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0); timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0); timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0); + timer_setup(&peer->probe_timer, sctp_generate_probe_event, 0); timer_setup(&peer->proto_unreach_timer, sctp_generate_proto_unreach_event, 0); @@ -131,6 +132,9 @@ void sctp_transport_free(struct sctp_transport *transport) if (del_timer(&transport->reconf_timer)) sctp_transport_put(transport); + if (del_timer(&transport->probe_timer)) + sctp_transport_put(transport); + /* Delete the ICMP proto unreachable timer if it's active. */ if (del_timer(&transport->proto_unreach_timer)) sctp_transport_put(transport); @@ -207,6 +211,15 @@ void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) sctp_transport_hold(transport); } +void sctp_transport_reset_probe_timer(struct sctp_transport *transport) +{ + if (timer_pending(&transport->probe_timer)) + return; + if (!mod_timer(&transport->probe_timer, + jiffies + transport->probe_interval)) + sctp_transport_hold(transport); +} + /* This transport has been assigned to an association. * Initialize fields from the association or from the sock itself. * Register the reference count in the association. @@ -241,12 +254,155 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = sctp_dst_mtu(transport->dst); else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; + + sctp_transport_pl_update(transport); +} + +bool sctp_transport_pl_send(struct sctp_transport *t) +{ + if (t->pl.probe_count < SCTP_MAX_PROBES) + goto out; + + t->pl.last_rtx_chunks = t->asoc->rtx_data_chunks; + t->pl.probe_count = 0; + if (t->pl.state == SCTP_PL_BASE) { + if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ + t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ + + t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } + } else if (t->pl.state == SCTP_PL_SEARCH) { + if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ + t->pl.state = SCTP_PL_BASE; /* Search -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + t->pl.probe_high = 0; + + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } else { /* Normal probe failure. */ + t->pl.probe_high = t->pl.probe_size; + t->pl.probe_size = t->pl.pmtu; + } + } else if (t->pl.state == SCTP_PL_COMPLETE) { + if (t->pl.pmtu == t->pl.probe_size) { /* Black Hole Detected */ + t->pl.state = SCTP_PL_BASE; /* Search Complete -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } + } + +out: + if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count < 30 && + !t->pl.probe_count && t->pl.last_rtx_chunks == t->asoc->rtx_data_chunks) { + t->pl.raise_count++; + return false; + } + + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); + + t->pl.probe_count++; + return true; +} + +bool sctp_transport_pl_recv(struct sctp_transport *t) +{ + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); + + t->pl.last_rtx_chunks = t->asoc->rtx_data_chunks; + t->pl.pmtu = t->pl.probe_size; + t->pl.probe_count = 0; + if (t->pl.state == SCTP_PL_BASE) { + t->pl.state = SCTP_PL_SEARCH; /* Base -> Search */ + t->pl.probe_size += SCTP_PL_BIG_STEP; + } else if (t->pl.state == SCTP_PL_ERROR) { + t->pl.state = SCTP_PL_SEARCH; /* Error -> Search */ + + t->pl.pmtu = t->pl.probe_size; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + t->pl.probe_size += SCTP_PL_BIG_STEP; + } else if (t->pl.state == SCTP_PL_SEARCH) { + if (!t->pl.probe_high) { + t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, + SCTP_MAX_PLPMTU); + return false; + } + t->pl.probe_size += SCTP_PL_MIN_STEP; + if (t->pl.probe_size >= t->pl.probe_high) { + t->pl.probe_high = 0; + t->pl.raise_count = 0; + t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */ + + t->pl.probe_size = t->pl.pmtu; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + sctp_assoc_sync_pmtu(t->asoc); + } + } else if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count == 30) { + /* Raise probe_size again after 30 * interval in Search Complete */ + t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ + t->pl.probe_size += SCTP_PL_MIN_STEP; + } + + return t->pl.state == SCTP_PL_COMPLETE; +} + +static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) +{ + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, ptb: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, pmtu); + + if (pmtu < SCTP_MIN_PLPMTU || pmtu >= t->pl.probe_size) + return false; + + if (t->pl.state == SCTP_PL_BASE) { + if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { + t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ + + t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + } + } else if (t->pl.state == SCTP_PL_SEARCH) { + if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { + t->pl.state = SCTP_PL_BASE; /* Search -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + t->pl.probe_count = 0; + + t->pl.probe_high = 0; + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { + t->pl.probe_size = pmtu; + t->pl.probe_count = 0; + + return false; + } + } else if (t->pl.state == SCTP_PL_COMPLETE) { + if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { + t->pl.state = SCTP_PL_BASE; /* Complete -> Base */ + t->pl.probe_size = SCTP_BASE_PLPMTU; + t->pl.probe_count = 0; + + t->pl.probe_high = 0; + t->pl.pmtu = SCTP_BASE_PLPMTU; + t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + } + } + + return true; } bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { - struct dst_entry *dst = sctp_transport_dst_check(t); struct sock *sk = t->asoc->base.sk; + struct dst_entry *dst; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { @@ -257,6 +413,10 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) } pmtu = SCTP_TRUNC4(pmtu); + if (sctp_transport_pl_enabled(t)) + return sctp_transport_pl_toobig(t, pmtu - sctp_transport_pl_hlen(t)); + + dst = sctp_transport_dst_check(t); if (dst) { struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); union sctp_addr addr; diff --git a/net/smc/Makefile b/net/smc/Makefile index 77e54fe42b1c..99a0186cba5b 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5eff7cccceff..c038efc23ce3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -49,6 +49,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_stats.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -508,9 +509,44 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->r0.qp_mtu; } -static void smc_switch_to_fallback(struct smc_sock *smc) +static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, + struct smc_stats_fback *fback_arr) +{ + int cnt; + + for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { + if (fback_arr[cnt].fback_code == smc->fallback_rsn) { + fback_arr[cnt].count++; + break; + } + if (!fback_arr[cnt].fback_code) { + fback_arr[cnt].fback_code = smc->fallback_rsn; + fback_arr[cnt].count++; + break; + } + } +} + +static void smc_stat_fallback(struct smc_sock *smc) +{ + struct net *net = sock_net(&smc->sk); + + mutex_lock(&net->smc.mutex_fback_rsn); + if (smc->listen_smc) { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); + net->smc.fback_rsn->srv_fback_cnt++; + } else { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); + net->smc.fback_rsn->clnt_fback_cnt++; + } + mutex_unlock(&net->smc.mutex_fback_rsn); +} + +static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { smc->use_fallback = true; + smc->fallback_rsn = reason_code; + smc_stat_fallback(smc); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; @@ -522,8 +558,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc) /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = reason_code; + smc_switch_to_fallback(smc, reason_code); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -535,9 +570,11 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, u8 version) { + struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; @@ -545,6 +582,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; @@ -757,7 +795,7 @@ static int smc_connect_rdma(struct smc_sock *smc, reason_code = SMC_CLC_DECL_NOSRVLINK; goto connect_abort; } - smc->conn.lnk = link; + smc_switch_link_and_count(&smc->conn, link); } /* create send buffer and rmb */ @@ -992,6 +1030,7 @@ static int __smc_connect(struct smc_sock *smc) if (rc) goto vlan_cleanup; + SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); kfree(ini); @@ -1307,7 +1346,9 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; + struct net *net = sock_net(newsmcsk); + this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; @@ -1325,8 +1366,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = reason_code; + smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1699,8 +1739,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; + smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); smc_listen_out_connected(new_smc); return; } @@ -1778,6 +1817,7 @@ static void smc_listen_work(struct work_struct *work) } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: @@ -1984,18 +2024,19 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; goto out; } } - if (smc->use_fallback) + if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); - else + } else { rc = smc_tx_sendmsg(smc, msg, len); + SMC_STAT_TX_PAYLOAD(smc, len, rc); + } out: release_sock(sk); return rc; @@ -2030,6 +2071,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: @@ -2176,7 +2218,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; - sk->sk_error_report(sk); + sk_error_report(sk); } if (optlen < sizeof(int)) @@ -2194,8 +2236,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -2204,18 +2245,22 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val) + if (val) { + SMC_STAT_INC(smc, ndly_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); + } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val) + if (!val) { + SMC_STAT_INC(smc, cork_cnt); mod_delayed_work(smc->conn.lgr->tx_wq, &smc->conn.tx_work, 0); + } } break; case TCP_DEFER_ACCEPT: @@ -2338,11 +2383,13 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, goto out; } release_sock(sk); - if (smc->use_fallback) + if (smc->use_fallback) { rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); - else + } else { + SMC_STAT_INC(smc, sendpage_cnt); rc = sock_no_sendpage(sock, page, offset, size, flags); + } out: return rc; @@ -2391,6 +2438,7 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, flags = MSG_DONTWAIT; else flags = 0; + SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: @@ -2479,6 +2527,16 @@ static void __net_exit smc_net_exit(struct net *net) smc_pnet_net_exit(net); } +static __net_init int smc_net_stat_init(struct net *net) +{ + return smc_stats_init(net); +} + +static void __net_exit smc_net_stat_exit(struct net *net) +{ + smc_stats_exit(net); +} + static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, @@ -2486,6 +2544,11 @@ static struct pernet_operations smc_net_ops = { .size = sizeof(struct smc_net), }; +static struct pernet_operations smc_net_stat_ops = { + .init = smc_net_stat_init, + .exit = smc_net_stat_exit, +}; + static int __init smc_init(void) { int rc; @@ -2494,6 +2557,10 @@ static int __init smc_init(void) if (rc) return rc; + rc = register_pernet_subsys(&smc_net_stat_ops); + if (rc) + return rc; + smc_ism_init(); smc_clc_init(); @@ -2595,6 +2662,7 @@ static void __exit smc_exit(void) proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); + unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0df85a12651e..af227b65669e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -33,6 +33,7 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_netlink.h" +#include "smc_stats.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -916,8 +917,8 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, return rc; } -static void smc_switch_link_and_count(struct smc_connection *conn, - struct smc_link *to_lnk) +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); conn->lnk = to_lnk; @@ -1235,20 +1236,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } -static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) -{ - int i; - - for (i = 0; i < SMC_RMBE_SIZES; i++) { - struct smc_buf_desc *buf_desc; - - list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { - buf_desc->len += sizeof(struct smcd_cdc_msg); - smc_ism_unregister_dmb(lgr->smcd, buf_desc); - } - } -} - static void smc_sk_wake_ups(struct smc_sock *smc) { smc->sk.sk_write_space(&smc->sk); @@ -1285,7 +1272,6 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); - smcd_unregister_all_dmbs(lgr); } else { u32 rsn = lgr->llc_termination_rsn; @@ -1766,21 +1752,30 @@ out: return rc; } -/* convert the RMB size into the compressed notation - minimum 16K. +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size) +static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb) + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + return compressed; } @@ -1996,17 +1991,12 @@ out: return rc; } -#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ - static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; - if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) - return ERR_PTR(-EAGAIN); - /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) @@ -2044,6 +2034,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + bool is_dgraded = false; struct mutex *lock; /* lock buffer list */ int sk_buf_size; @@ -2054,9 +2045,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(sk_buf_size); + for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { - if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; @@ -2065,12 +2055,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); - if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) - continue; /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } @@ -2082,9 +2072,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (PTR_ERR(buf_desc) == -ENOMEM) break; - if (IS_ERR(buf_desc)) + if (IS_ERR(buf_desc)) { + if (!is_dgraded) { + is_dgraded = true; + SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); + } continue; + } + SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; mutex_lock(lock); list_add(&buf_desc->list, buf_list); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6d6fd1397c87..c043ecdca5c4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -97,6 +97,7 @@ struct smc_link { unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ + atomic_t wr_tx_refcnt; /* tx refs to link */ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ @@ -109,6 +110,7 @@ struct smc_link { struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ + atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ @@ -444,6 +446,8 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 7d7ba0320d5a..a8845343d183 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -753,8 +753,7 @@ void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) if (!libdev->ops.get_netdev) continue; lndev = libdev->ops.get_netdev(libdev, i + 1); - if (lndev) - dev_put(lndev); + dev_put(lndev); if (lndev != ndev) continue; if (event == NETDEV_REGISTER) diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 967712ba52a0..9cb2df289963 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -470,7 +470,6 @@ void smcd_unregister_dev(struct smcd_dev *smcd) mutex_unlock(&smcd_dev_list.mutex); smcd->going_away = 1; smc_smcd_terminate_all(smcd); - flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); device_del(&smcd->dev); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 273eaf1bfe49..2e7560eba981 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -888,6 +888,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (!rc) goto out; out_clear_lnk: + lnk_new->state = SMC_LNK_INACTIVE; smcr_link_clear(lnk_new, false); out_reject: smc_llc_cli_add_link_reject(qentry); @@ -1184,6 +1185,7 @@ int smc_llc_srv_add_link(struct smc_link *link) goto out_err; return 0; out_err: + link_new->state = SMC_LNK_INACTIVE; smcr_link_clear(link_new, false); return rc; } @@ -1286,10 +1288,8 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) del_llc->reason = 0; smc_llc_send_message(lnk, &qentry->msg); /* response */ - if (smc_link_downing(&lnk_del->state)) { - if (smc_switch_conns(lgr, lnk_del, false)) - smc_wr_tx_wait_no_pending_sends(lnk_del); - } + if (smc_link_downing(&lnk_del->state)) + smc_switch_conns(lgr, lnk_del, false); smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); @@ -1805,8 +1805,6 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); } /* register a new rtoken at the remote peer (for all links) */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 140419a19dbf..6fb6f96c1d17 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -19,6 +19,7 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_ib.h" +#include "smc_stats.h" #include "smc_netlink.h" #define SMC_CMD_MAX_ATTR 1 @@ -55,6 +56,16 @@ static const struct genl_ops smc_gen_nl_ops[] = { /* can be retrieved by unprivileged users */ .dumpit = smcr_nl_get_device, }, + { + .cmd = SMC_NETLINK_GET_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_stats, + }, + { + .cmd = SMC_NETLINK_GET_FBACK_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_fback_stats, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index 3477265cba6c..5ce2c0a89ccd 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -18,7 +18,7 @@ extern struct genl_family smc_gen_nl_family; struct smc_nl_dmp_ctx { - int pos[2]; + int pos[3]; }; static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 6f6d33edb135..4a964e9190b0 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -394,8 +394,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, return 0; out_put: - if (ndev) - dev_put(ndev); + dev_put(ndev); return rc; } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index fcfac59f8b72..170b733bc736 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -21,6 +21,7 @@ #include "smc_cdc.h" #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" +#include "smc_stats.h" /* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). @@ -227,6 +228,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, conn->urg_state == SMC_URG_READ) return -EINVAL; + SMC_STAT_INC(smc, urg_data_cnt); if (conn->urg_state == SMC_URG_VALID) { if (!(flags & MSG_PEEK)) smc->conn.urg_state = SMC_URG_READ; @@ -303,6 +305,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + readable = atomic_read(&conn->bytes_to_rcv); + if (readable >= conn->rmb_desc->len) + SMC_STAT_RMB_RX_FULL(smc, !conn->lnk); + + if (len < readable) + SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c new file mode 100644 index 000000000000..e80e34f7ac15 --- /dev/null +++ b/net/smc/smc_stats.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC statistics netlink routines + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> +#include <net/genetlink.h> +#include <net/sock.h> +#include "smc_netlink.h" +#include "smc_stats.h" + +int smc_stats_init(struct net *net) +{ + net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); + if (!net->smc.fback_rsn) + goto err_fback; + net->smc.smc_stats = alloc_percpu(struct smc_stats); + if (!net->smc.smc_stats) + goto err_stats; + mutex_init(&net->smc.mutex_fback_rsn); + return 0; + +err_stats: + kfree(net->smc.fback_rsn); +err_fback: + return -ENOMEM; +} + +void smc_stats_exit(struct net *net) +{ + kfree(net->smc.fback_rsn); + if (net->smc.smc_stats) + free_percpu(net->smc.smc_stats); +} + +static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_rmbcnt *stats_rmb_cnt; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TX_RMB_STATS) + stats_rmb_cnt = &stats->smc[tech].rmb_tx; + else + stats_rmb_cnt = &stats->smc[tech].rmb_rx; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, + stats_rmb_cnt->reuse_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, + stats_rmb_cnt->buf_size_small_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, + stats_rmb_cnt->buf_size_small_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, + stats_rmb_cnt->buf_full_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, + stats_rmb_cnt->buf_full_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, + stats_rmb_cnt->alloc_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, + stats_rmb_cnt->dgrade_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_memsize *stats_pload; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) + stats_pload = &stats->smc[tech].tx_pd; + else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) + stats_pload = &stats->smc[tech].rx_pd; + else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) + stats_pload = &stats->smc[tech].tx_rmbsize; + else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) + stats_pload = &stats->smc[tech].rx_rmbsize; + else + goto errout; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, + stats_pload->buf[SMC_BUF_8K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, + stats_pload->buf[SMC_BUF_16K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, + stats_pload->buf[SMC_BUF_32K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, + stats_pload->buf[SMC_BUF_64K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, + stats_pload->buf[SMC_BUF_128K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, + stats_pload->buf[SMC_BUF_256K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, + stats_pload->buf[SMC_BUF_512K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, + stats_pload->buf[SMC_BUF_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, + stats_pload->buf[SMC_BUF_G_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, + struct smc_stats *stats, int tech) +{ + struct smc_stats_tech *smc_tech; + struct nlattr *attrs; + + smc_tech = &stats->smc[tech]; + if (tech == SMC_TYPE_D) + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); + else + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); + + if (!attrs) + goto errout; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_SIZE)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, + smc_tech->clnt_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, + smc_tech->clnt_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, + smc_tech->srv_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, + smc_tech->srv_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, + smc_tech->rx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, + smc_tech->tx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, + smc_tech->rx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, + smc_tech->tx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, + smc_tech->sendpage_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, + smc_tech->cork_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, + smc_tech->ndly_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, + smc_tech->splice_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, + smc_tech->urg_data_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +int smc_nl_get_stats(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + struct smc_stats *stats; + struct nlattr *attrs; + int cpu, i, size; + void *nlh; + u64 *src; + u64 *sum; + + if (cb_ctx->pos[0]) + goto errmsg; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_STATS); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_STATS); + if (!attrs) + goto errnest; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + goto erralloc; + size = sizeof(*stats) / sizeof(u64); + for_each_possible_cpu(cpu) { + src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); + sum = (u64 *)stats; + for (i = 0; i < size; i++) + *(sum++) += *(src++); + } + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) + goto errattr; + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, + stats->clnt_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, + stats->srv_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + kfree(stats); + return skb->len; + +errattr: + kfree(stats); +erralloc: + nla_nest_cancel(skb, attrs); +errnest: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_nl_get_fback_details(struct sk_buff *skb, + struct netlink_callback *cb, int pos, + bool is_srv) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int cnt_reported = cb_ctx->pos[2]; + struct smc_stats_fback *trgt_arr; + struct nlattr *attrs; + int rc = 0; + void *nlh; + + if (is_srv) + trgt_arr = &net->smc.fback_rsn->srv[0]; + else + trgt_arr = &net->smc.fback_rsn->clnt[0]; + if (!trgt_arr[pos].fback_code) + return -ENODATA; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_FBACK_STATS); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) + goto errattr; + if (!cnt_reported) { + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, + net->smc.fback_rsn->srv_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, + net->smc.fback_rsn->clnt_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + cnt_reported = 1; + } + + if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, + trgt_arr[pos].fback_code)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, + trgt_arr[pos].count)) + goto errattr; + + cb_ctx->pos[2] = cnt_reported; + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return rc; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int rc_srv = 0, rc_clnt = 0, k; + int skip_serv = cb_ctx->pos[1]; + int snum = cb_ctx->pos[0]; + bool is_srv = true; + + mutex_lock(&net->smc.mutex_fback_rsn); + for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { + if (k < snum) + continue; + if (!skip_serv) { + rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); + if (rc_srv && rc_srv != -ENODATA) + break; + } else { + skip_serv = 0; + } + rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); + if (rc_clnt && rc_clnt != -ENODATA) { + skip_serv = 1; + break; + } + if (rc_clnt == -ENODATA && rc_srv == -ENODATA) + break; + } + mutex_unlock(&net->smc.mutex_fback_rsn); + cb_ctx->pos[1] = skip_serv; + cb_ctx->pos[0] = k; + return skb->len; +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h new file mode 100644 index 000000000000..84b7ecd8c05c --- /dev/null +++ b/net/smc/smc_stats.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Macros for SMC statistics + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ + +#ifndef NET_SMC_SMC_STATS_H_ +#define NET_SMC_SMC_STATS_H_ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> + +#include "smc_clc.h" + +#define SMC_MAX_FBACK_RSN_CNT 30 + +enum { + SMC_BUF_8K, + SMC_BUF_16K, + SMC_BUF_32K, + SMC_BUF_64K, + SMC_BUF_128K, + SMC_BUF_256K, + SMC_BUF_512K, + SMC_BUF_1024K, + SMC_BUF_G_1024K, + SMC_BUF_MAX, +}; + +struct smc_stats_fback { + int fback_code; + u16 count; +}; + +struct smc_stats_rsn { + struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT]; + struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT]; + u64 srv_fback_cnt; + u64 clnt_fback_cnt; +}; + +struct smc_stats_rmbcnt { + u64 buf_size_small_peer_cnt; + u64 buf_size_small_cnt; + u64 buf_full_peer_cnt; + u64 buf_full_cnt; + u64 reuse_cnt; + u64 alloc_cnt; + u64 dgrade_cnt; +}; + +struct smc_stats_memsize { + u64 buf[SMC_BUF_MAX]; +}; + +struct smc_stats_tech { + struct smc_stats_memsize tx_rmbsize; + struct smc_stats_memsize rx_rmbsize; + struct smc_stats_memsize tx_pd; + struct smc_stats_memsize rx_pd; + struct smc_stats_rmbcnt rmb_tx; + struct smc_stats_rmbcnt rmb_rx; + u64 clnt_v1_succ_cnt; + u64 clnt_v2_succ_cnt; + u64 srv_v1_succ_cnt; + u64 srv_v2_succ_cnt; + u64 sendpage_cnt; + u64 urg_data_cnt; + u64 splice_cnt; + u64 cork_cnt; + u64 ndly_cnt; + u64 rx_bytes; + u64 tx_bytes; + u64 rx_cnt; + u64 tx_cnt; +}; + +struct smc_stats { + struct smc_stats_tech smc[2]; + u64 clnt_hshake_err_cnt; + u64 srv_hshake_err_cnt; +}; + +#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \ +do { \ + typeof(_smc_stats) stats = (_smc_stats); \ + typeof(_tech) t = (_tech); \ + typeof(_len) l = (_len); \ + int _pos = fls64((l) >> 13); \ + typeof(_rc) r = (_rc); \ + int m = SMC_BUF_MAX - 1; \ + this_cpu_inc((*stats).smc[t].key ## _cnt); \ + if (r <= 0) \ + break; \ + _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ + this_cpu_add((*stats).smc[t].key ## _bytes, r); \ +} \ +while (0) + +#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ +do { \ + typeof(_len) _l = (_len); \ + typeof(_tech) t = (_tech); \ + int _pos = fls((_l) >> 13); \ + int m = SMC_BUF_MAX - 1; \ + _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ +} \ +while (0) + +#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ + this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) + +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ +do { \ + struct net *_net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + typeof(_len) l = (_len); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ +} \ +while (0) + +#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \ +do { \ + struct net *net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \ +} \ +while (0) + +#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, reuse, is_smcd, is_rx) + +#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, alloc, is_smcd, is_rx) + +#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx) + +#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, false) + +#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, false) + +#define SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, true) + +#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, true) + +#define SMC_STAT_INC(_smc, type) \ +do { \ + typeof(_smc) __smc = _smc; \ + bool is_smcd = !(__smc)->conn.lnk; \ + struct net *net = sock_net(&(__smc)->sk); \ + struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \ + if ((is_smcd)) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \ + else \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \ +} \ +while (0) + +#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \ +do { \ + typeof(_aclc) acl = (_aclc); \ + bool is_v2 = (acl->hdr.version == SMC_V2); \ + bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \ + struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \ +} \ +while (0) + +#define SMC_STAT_SERV_SUCC_INC(net, _ini) \ +do { \ + typeof(_ini) i = (_ini); \ + bool is_v2 = (i->smcd_version & SMC_V2); \ + bool is_smcd = (i->is_smcd); \ + typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \ +} \ +while (0) + +int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_stats_init(struct net *net); +void smc_stats_exit(struct net *net); + +#endif /* NET_SMC_SMC_STATS_H_ */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 4532c16bf85e..c79361dfcdfb 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -27,6 +27,7 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" +#include "smc_stats.h" #define SMC_TX_WORK_DELAY 0 #define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ @@ -45,6 +46,8 @@ static void smc_tx_write_space(struct sock *sk) /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) + SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); @@ -151,9 +154,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) goto out_err; } + if (sk->sk_state == SMC_INIT) + return -ENOTCONN; + + if (len > conn->sndbuf_desc->len) + SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); + + if (len > conn->peer_rmbe_size) + SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); + + if (msg->msg_flags & MSG_OOB) + SMC_STAT_INC(smc, urg_data_cnt); + while (msg_data_left(msg)) { - if (sk->sk_state == SMC_INIT) - return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || conn->killed) @@ -419,8 +432,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* destination: RMBE */ /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); - if (rmbespace <= 0) + if (rmbespace <= 0) { + struct smc_sock *smc = container_of(conn, struct smc_sock, + conn); + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; + } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); @@ -479,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; @@ -533,6 +550,22 @@ out_unlock: return rc; } +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_link *link = conn->lnk; + int rc = -ENOLINK; + + if (!link) + return rc; + + atomic_inc(&link->wr_tx_refcnt); + if (smc_link_usable(link)) + rc = _smcr_tx_sndbuf_nonempty(conn); + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); + return rc; +} + static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index cbc73a7e4d59..a419e9af36b9 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -322,9 +322,12 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (rc) return rc; + atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); + if (atomic_dec_and_test(&link->wr_reg_refcnt)) + wake_up_all(&link->wr_reg_wait); if (!rc) { /* timeout - terminate link */ smcr_link_down_cond_sched(link); @@ -566,10 +569,15 @@ void smc_wr_free_link(struct smc_link *lnk) return; ibdev = lnk->smcibdev->ibdev; + smc_wr_wakeup_reg_wait(lnk); + smc_wr_wakeup_tx_wait(lnk); + if (smc_wr_tx_wait_no_pending_sends(lnk)) memset(lnk->wr_tx_mask, 0, BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); + wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, @@ -728,7 +736,9 @@ int smc_wr_create_link(struct smc_link *lnk) memset(lnk->wr_tx_mask, 0, BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); init_waitqueue_head(&lnk->wr_tx_wait); + atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); + atomic_set(&lnk->wr_reg_refcnt, 0); return rc; dma_unmap: diff --git a/net/socket.c b/net/socket.c index 4f2c6d2795d0..7f64a6eccf63 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,6 +104,7 @@ #include <linux/sockios.h> #include <net/busy_poll.h> #include <linux/errqueue.h> +#include <linux/ptp_clock_kernel.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -165,6 +166,55 @@ static const struct file_operations socket_file_ops = { .show_fdinfo = sock_show_fdinfo, }; +static const char * const pf_family_names[] = { + [PF_UNSPEC] = "PF_UNSPEC", + [PF_UNIX] = "PF_UNIX/PF_LOCAL", + [PF_INET] = "PF_INET", + [PF_AX25] = "PF_AX25", + [PF_IPX] = "PF_IPX", + [PF_APPLETALK] = "PF_APPLETALK", + [PF_NETROM] = "PF_NETROM", + [PF_BRIDGE] = "PF_BRIDGE", + [PF_ATMPVC] = "PF_ATMPVC", + [PF_X25] = "PF_X25", + [PF_INET6] = "PF_INET6", + [PF_ROSE] = "PF_ROSE", + [PF_DECnet] = "PF_DECnet", + [PF_NETBEUI] = "PF_NETBEUI", + [PF_SECURITY] = "PF_SECURITY", + [PF_KEY] = "PF_KEY", + [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", + [PF_PACKET] = "PF_PACKET", + [PF_ASH] = "PF_ASH", + [PF_ECONET] = "PF_ECONET", + [PF_ATMSVC] = "PF_ATMSVC", + [PF_RDS] = "PF_RDS", + [PF_SNA] = "PF_SNA", + [PF_IRDA] = "PF_IRDA", + [PF_PPPOX] = "PF_PPPOX", + [PF_WANPIPE] = "PF_WANPIPE", + [PF_LLC] = "PF_LLC", + [PF_IB] = "PF_IB", + [PF_MPLS] = "PF_MPLS", + [PF_CAN] = "PF_CAN", + [PF_TIPC] = "PF_TIPC", + [PF_BLUETOOTH] = "PF_BLUETOOTH", + [PF_IUCV] = "PF_IUCV", + [PF_RXRPC] = "PF_RXRPC", + [PF_ISDN] = "PF_ISDN", + [PF_PHONET] = "PF_PHONET", + [PF_IEEE802154] = "PF_IEEE802154", + [PF_CAIF] = "PF_CAIF", + [PF_ALG] = "PF_ALG", + [PF_NFC] = "PF_NFC", + [PF_VSOCK] = "PF_VSOCK", + [PF_KCM] = "PF_KCM", + [PF_QIPCRTR] = "PF_QIPCRTR", + [PF_SMC] = "PF_SMC", + [PF_XDP] = "PF_XDP", + [PF_MCTP] = "PF_MCTP", +}; + /* * The protocol list. Each protocol is registered in here. */ @@ -825,12 +875,18 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && - !skb_is_swtx_tstamp(skb, false_tstamp) && - ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { - empty = 0; - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && - !skb_is_err_queue(skb)) - put_ts_pktinfo(msg, skb); + !skb_is_swtx_tstamp(skb, false_tstamp)) { + if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) + ptp_convert_timestamp(shhwtstamps, sk->sk_bind_phc); + + if (ktime_to_timespec64_cond(shhwtstamps->hwtstamp, + tss.ts + 2)) { + empty = 0; + + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + !skb_is_err_queue(skb)) + put_ts_pktinfo(msg, skb); + } } if (!empty) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) @@ -1009,9 +1065,13 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) */ static DEFINE_MUTEX(br_ioctl_mutex); -static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg); +static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg); -void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) +void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; @@ -1019,6 +1079,22 @@ void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) } EXPORT_SYMBOL(brioctl_set); +int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg) +{ + int err = -ENOPKG; + + if (!br_ioctl_hook) + request_module("bridge"); + + mutex_lock(&br_ioctl_mutex); + if (br_ioctl_hook) + err = br_ioctl_hook(net, br, cmd, ifr, uarg); + mutex_unlock(&br_ioctl_mutex); + + return err; +} + static DEFINE_MUTEX(vlan_ioctl_mutex); static int (*vlan_ioctl_hook) (struct net *, void __user *arg); @@ -1033,8 +1109,11 @@ EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { + struct ifreq ifr; + bool need_copyout; int err; void __user *argp = (void __user *)arg; + void __user *data; err = sock->ops->ioctl(sock, cmd, arg); @@ -1045,25 +1124,16 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, if (err != -ENOIOCTLCMD) return err; - if (cmd == SIOCGIFCONF) { - struct ifconf ifc; - if (copy_from_user(&ifc, argp, sizeof(struct ifconf))) - return -EFAULT; - rtnl_lock(); - err = dev_ifconf(net, &ifc, sizeof(struct ifreq)); - rtnl_unlock(); - if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf))) - err = -EFAULT; - } else { - struct ifreq ifr; - bool need_copyout; - if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + if (!is_socket_ioctl_cmd(cmd)) + return -ENOTTY; + + if (get_user_ifreq(&ifr, &data, argp)) + return -EFAULT; + err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); + if (!err && need_copyout) + if (put_user_ifreq(&ifr, argp)) return -EFAULT; - err = dev_ioctl(net, cmd, &ifr, &need_copyout); - if (!err && need_copyout) - if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) - return -EFAULT; - } + return err; } @@ -1085,12 +1155,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { struct ifreq ifr; + void __user *data; bool need_copyout; - if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; - err = dev_ioctl(net, cmd, &ifr, &need_copyout); + err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) - if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) + if (put_user_ifreq(&ifr, argp)) return -EFAULT; } else #ifdef CONFIG_WEXT_CORE @@ -1115,14 +1186,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: - err = -ENOPKG; - if (!br_ioctl_hook) - request_module("bridge"); - - mutex_lock(&br_ioctl_mutex); - if (br_ioctl_hook) - err = br_ioctl_hook(net, cmd, argp); - mutex_unlock(&br_ioctl_mutex); + err = br_ioctl_call(net, NULL, cmd, NULL, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: @@ -1162,6 +1226,11 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) cmd == SIOCGSTAMP_NEW, false); break; + + case SIOCGIFCONF: + err = dev_ifconf(net, argp); + break; + default: err = sock_do_ioctl(net, sock, cmd, arg); break; @@ -1667,32 +1736,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -int __sys_accept4_file(struct file *file, unsigned file_flags, +struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags, - unsigned long nofile) + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; - int err, len, newfd; + int err, len; struct sockaddr_storage address; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - - if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) - flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file); - if (!sock) { - err = -ENOTSOCK; - goto out; - } + if (!sock) + return ERR_PTR(-ENOTSOCK); - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = sock->type; newsock->ops = sock->ops; @@ -1703,18 +1762,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = __get_unused_fd_flags(flags, nofile); - if (unlikely(newfd < 0)) { - err = newfd; - sock_release(newsock); - goto out; - } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - put_unused_fd(newfd); - goto out; - } + if (IS_ERR(newfile)) + return newfile; err = security_socket_accept(sock, newsock); if (err) @@ -1739,16 +1789,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, } /* File flags are not inherited via accept() unlike another OSes. */ - - fd_install(newfd, newfile); - err = newfd; -out: - return err; + return newfile; out_fd: fput(newfile); - put_unused_fd(newfd); - goto out; + return ERR_PTR(err); +} +int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags, + unsigned long nofile) +{ + struct file *newfile; + int newfd; + + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + newfd = __get_unused_fd_flags(flags, nofile); + if (unlikely(newfd < 0)) + return newfd; + + newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen, + flags); + if (IS_ERR(newfile)) { + put_unused_fd(newfd); + return PTR_ERR(newfile); + } + fd_install(newfd, newfile); + return newfd; } /* @@ -2975,7 +3047,7 @@ int sock_register(const struct net_proto_family *ops) } spin_unlock(&net_family_lock); - pr_info("NET: Registered protocol family %d\n", ops->family); + pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); @@ -3003,7 +3075,7 @@ void sock_unregister(int family) synchronize_rcu(); - pr_info("NET: Unregistered protocol family %d\n", family); + pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]); } EXPORT_SYMBOL(sock_unregister); @@ -3071,154 +3143,55 @@ void socket_seq_show(struct seq_file *seq) } #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_COMPAT -static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) +/* Handle the fact that while struct ifreq has the same *layout* on + * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, + * which are handled elsewhere, it still has different *size* due to + * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, + * resulting in struct ifreq being 32 and 40 bytes respectively). + * As a result, if the struct happens to be at the end of a page and + * the next page isn't readable/writable, we get a fault. To prevent + * that, copy back and forth to the full size. + */ +int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg) { - struct compat_ifconf ifc32; - struct ifconf ifc; - int err; + if (in_compat_syscall()) { + struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr; - if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) - return -EFAULT; + memset(ifr, 0, sizeof(*ifr)); + if (copy_from_user(ifr32, arg, sizeof(*ifr32))) + return -EFAULT; - ifc.ifc_len = ifc32.ifc_len; - ifc.ifc_req = compat_ptr(ifc32.ifcbuf); + if (ifrdata) + *ifrdata = compat_ptr(ifr32->ifr_data); - rtnl_lock(); - err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq)); - rtnl_unlock(); - if (err) - return err; + return 0; + } - ifc32.ifc_len = ifc.ifc_len; - if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf))) + if (copy_from_user(ifr, arg, sizeof(*ifr))) return -EFAULT; + if (ifrdata) + *ifrdata = ifr->ifr_data; + return 0; } +EXPORT_SYMBOL(get_user_ifreq); -static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) +int put_user_ifreq(struct ifreq *ifr, void __user *arg) { - struct compat_ethtool_rxnfc __user *compat_rxnfc; - bool convert_in = false, convert_out = false; - size_t buf_size = 0; - struct ethtool_rxnfc __user *rxnfc = NULL; - struct ifreq ifr; - u32 rule_cnt = 0, actual_rule_cnt; - u32 ethcmd; - u32 data; - int ret; + size_t size = sizeof(*ifr); - if (get_user(data, &ifr32->ifr_ifru.ifru_data)) - return -EFAULT; - - compat_rxnfc = compat_ptr(data); - - if (get_user(ethcmd, &compat_rxnfc->cmd)) - return -EFAULT; - - /* Most ethtool structures are defined without padding. - * Unfortunately struct ethtool_rxnfc is an exception. - */ - switch (ethcmd) { - default: - break; - case ETHTOOL_GRXCLSRLALL: - /* Buffer size is variable */ - if (get_user(rule_cnt, &compat_rxnfc->rule_cnt)) - return -EFAULT; - if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) - return -ENOMEM; - buf_size += rule_cnt * sizeof(u32); - fallthrough; - case ETHTOOL_GRXRINGS: - case ETHTOOL_GRXCLSRLCNT: - case ETHTOOL_GRXCLSRULE: - case ETHTOOL_SRXCLSRLINS: - convert_out = true; - fallthrough; - case ETHTOOL_SRXCLSRLDEL: - buf_size += sizeof(struct ethtool_rxnfc); - convert_in = true; - rxnfc = compat_alloc_user_space(buf_size); - break; - } + if (in_compat_syscall()) + size = sizeof(struct compat_ifreq); - if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ)) + if (copy_to_user(arg, ifr, size)) return -EFAULT; - ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc; - - if (convert_in) { - /* We expect there to be holes between fs.m_ext and - * fs.ring_cookie and at the end of fs, but nowhere else. - */ - BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + - sizeof(compat_rxnfc->fs.m_ext) != - offsetof(struct ethtool_rxnfc, fs.m_ext) + - sizeof(rxnfc->fs.m_ext)); - BUILD_BUG_ON( - offsetof(struct compat_ethtool_rxnfc, fs.location) - - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != - offsetof(struct ethtool_rxnfc, fs.location) - - offsetof(struct ethtool_rxnfc, fs.ring_cookie)); - - if (copy_in_user(rxnfc, compat_rxnfc, - (void __user *)(&rxnfc->fs.m_ext + 1) - - (void __user *)rxnfc) || - copy_in_user(&rxnfc->fs.ring_cookie, - &compat_rxnfc->fs.ring_cookie, - (void __user *)(&rxnfc->fs.location + 1) - - (void __user *)&rxnfc->fs.ring_cookie)) - return -EFAULT; - if (ethcmd == ETHTOOL_GRXCLSRLALL) { - if (put_user(rule_cnt, &rxnfc->rule_cnt)) - return -EFAULT; - } else if (copy_in_user(&rxnfc->rule_cnt, - &compat_rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) - return -EFAULT; - } - - ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL); - if (ret) - return ret; - - if (convert_out) { - if (copy_in_user(compat_rxnfc, rxnfc, - (const void __user *)(&rxnfc->fs.m_ext + 1) - - (const void __user *)rxnfc) || - copy_in_user(&compat_rxnfc->fs.ring_cookie, - &rxnfc->fs.ring_cookie, - (const void __user *)(&rxnfc->fs.location + 1) - - (const void __user *)&rxnfc->fs.ring_cookie) || - copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) - return -EFAULT; - - if (ethcmd == ETHTOOL_GRXCLSRLALL) { - /* As an optimisation, we only copy the actual - * number of rules that the underlying - * function returned. Since Mallory might - * change the rule count in user memory, we - * check that it is less than the rule count - * originally given (as the user buffer size), - * which has been range-checked. - */ - if (get_user(actual_rule_cnt, &rxnfc->rule_cnt)) - return -EFAULT; - if (actual_rule_cnt < rule_cnt) - rule_cnt = actual_rule_cnt; - if (copy_in_user(&compat_rxnfc->rule_locs[0], - &rxnfc->rule_locs[0], - rule_cnt * sizeof(u32))) - return -EFAULT; - } - } - return 0; } +EXPORT_SYMBOL(put_user_ifreq); +#ifdef CONFIG_COMPAT static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { compat_uptr_t uptr32; @@ -3226,7 +3199,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32 void __user *saved; int err; - if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq))) + if (get_user_ifreq(&ifr, NULL, uifr32)) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) @@ -3235,10 +3208,10 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32 saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); - err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL); + err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL); if (!err) { ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; - if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq))) + if (put_user_ifreq(&ifr, uifr32)) err = -EFAULT; } return err; @@ -3249,97 +3222,15 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { struct ifreq ifreq; - u32 data32; - - if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ)) - return -EFAULT; - if (get_user(data32, &u_ifreq32->ifr_data)) - return -EFAULT; - ifreq.ifr_data = compat_ptr(data32); - - return dev_ioctl(net, cmd, &ifreq, NULL); -} - -static int compat_ifreq_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, - struct compat_ifreq __user *uifr32) -{ - struct ifreq __user *uifr; - int err; - - /* Handle the fact that while struct ifreq has the same *layout* on - * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, - * which are handled elsewhere, it still has different *size* due to - * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, - * resulting in struct ifreq being 32 and 40 bytes respectively). - * As a result, if the struct happens to be at the end of a page and - * the next page isn't readable/writable, we get a fault. To prevent - * that, copy back and forth to the full size. - */ - - uifr = compat_alloc_user_space(sizeof(*uifr)); - if (copy_in_user(uifr, uifr32, sizeof(*uifr32))) - return -EFAULT; - - err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr); - - if (!err) { - switch (cmd) { - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFMEM: - case SIOCGIFHWADDR: - case SIOCGIFINDEX: - case SIOCGIFADDR: - case SIOCGIFBRDADDR: - case SIOCGIFDSTADDR: - case SIOCGIFNETMASK: - case SIOCGIFPFLAGS: - case SIOCGIFTXQLEN: - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCGIFNAME: - if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) - err = -EFAULT; - break; - } - } - return err; -} + void __user *data; -static int compat_sioc_ifmap(struct net *net, unsigned int cmd, - struct compat_ifreq __user *uifr32) -{ - struct ifreq ifr; - struct compat_ifmap __user *uifmap32; - int err; - - uifmap32 = &uifr32->ifr_ifru.ifru_map; - err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); - err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= get_user(ifr.ifr_map.port, &uifmap32->port); - if (err) + if (!is_socket_ioctl_cmd(cmd)) + return -ENOTTY; + if (get_user_ifreq(&ifreq, &data, u_ifreq32)) return -EFAULT; + ifreq.ifr_data = data; - err = dev_ioctl(net, cmd, &ifr, NULL); - - if (cmd == SIOCGIFMAP && !err) { - err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); - err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= put_user(ifr.ifr_map.port, &uifmap32->port); - if (err) - err = -EFAULT; - } - return err; + return dev_ioctl(net, cmd, &ifreq, data, NULL); } /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE @@ -3365,21 +3256,14 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, struct net *net = sock_net(sk); if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) - return compat_ifr_data_ioctl(net, cmd, argp); + return sock_ioctl(file, cmd, (unsigned long)argp); switch (cmd) { case SIOCSIFBR: case SIOCGIFBR: return old_bridge_ioctl(argp); - case SIOCGIFCONF: - return compat_dev_ifconf(net, argp); - case SIOCETHTOOL: - return ethtool_ioctl(net, argp); case SIOCWANDEV: return compat_siocwandev(net, argp); - case SIOCGIFMAP: - case SIOCSIFMAP: - return compat_sioc_ifmap(net, cmd, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) @@ -3387,6 +3271,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); + case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: @@ -3404,10 +3289,13 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: + case SIOCGIFCONF: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: case SIOCSIFFLAGS: + case SIOCGIFMAP: + case SIOCSIFMAP: case SIOCGIFMETRIC: case SIOCSIFMETRIC: case SIOCGIFMTU: @@ -3444,8 +3332,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - return compat_ifreq_ioctl(net, sock, cmd, argp); - case SIOCSARP: case SIOCGARP: case SIOCDARP: diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index b3815c1e8f2e..9c0343568d2a 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -58,7 +58,7 @@ static void strp_abort_strp(struct strparser *strp, int err) /* Report an error on the lower socket */ sk->sk_err = -err; - sk->sk_error_report(sk); + sk_error_report(sk); } } diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 9488600451e8..1c8de397d6ad 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o \ + sunrpc_syms.o cache.o rpc_pipe.o sysfs.o \ svc_xprt.o \ xprtmultipath.o sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index d1c003a25b0f..61c276bddaf2 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -160,7 +160,7 @@ static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 6dff64374bfe..3e776e3dff91 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -194,6 +194,8 @@ static void rsi_request(struct cache_detail *cd, qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; + WARN_ONCE(*blen < 0, + "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, @@ -707,11 +709,11 @@ svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) /* * Verify the checksum on the header and return SVC_OK on success. * Otherwise, return SVC_DROP (in the case of a bad sequence number) - * or return SVC_DENIED and indicate error in authp. + * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat. */ static int gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, - __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp) + __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct gss_ctx *ctx_id = rsci->mechctx; struct xdr_buf rpchdr; @@ -725,7 +727,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; if (argv->iov_len < 4) return SVC_DENIED; flavor = svc_getnl(argv); @@ -737,13 +739,13 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) @@ -1038,6 +1040,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; + rqstp->rq_auth_stat = rpc_autherr_badcred; + /* * A gss export can be specified either by: * export *(sec=krb5,rw) @@ -1053,6 +1057,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } @@ -1142,7 +1148,7 @@ static void gss_free_in_token_pages(struct gssp_in_token *in_token) } static int gss_read_proxy_verf(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp, + struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { @@ -1151,7 +1157,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, int pages, i, res, pgto, pgfrom; size_t inlen, to_offs, from_offs; - res = gss_read_common_verf(gc, argv, authp, in_handle); + res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle); if (res) return res; @@ -1227,7 +1233,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit, * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1236,7 +1242,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); - ret = gss_read_verf(gc, argv, authp, + ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat, &rsikey.in_handle, &rsikey.in_token); if (ret) return ret; @@ -1275,7 +1281,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; - int status = -EINVAL; + int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ @@ -1339,7 +1345,7 @@ out: } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *resv = &rqstp->rq_res.head[0]; struct xdr_netobj cli_handle; @@ -1351,8 +1357,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); - ret = gss_read_proxy_verf(rqstp, gc, authp, - &ud.in_handle, &ud.in_token); + ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; @@ -1525,7 +1530,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) {} * response here and return SVC_COMPLETE. */ static int -svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_gss_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1538,7 +1543,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) @@ -1575,22 +1580,22 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) goto auth_err; - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (use_gss_proxy(SVC_NET(rqstp))) - return svcauth_gss_proxy_init(rqstp, gc, authp); + return svcauth_gss_proxy_init(rqstp, gc); else - return svcauth_gss_legacy_init(rqstp, gc, authp); + return svcauth_gss_legacy_init(rqstp, gc); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: /* Look up the context, and check the verifier: */ - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; - switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: @@ -1600,7 +1605,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) } break; default: - *authp = rpc_autherr_rejectedcred; + rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } @@ -1616,13 +1621,13 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) svc_putnl(resv, RPC_SUCCESS); goto complete; case RPC_GSS_PROC_DATA: - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; svcdata->verf_start = resv->iov_base + resv->iov_len; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; @@ -1980,7 +1985,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 1a2c1c44bb00..59641803472c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -803,7 +803,7 @@ static int cache_request(struct cache_detail *detail, detail->cache_request(detail, crq->item, &bp, &len); if (len < 0) - return -EAGAIN; + return -E2BIG; return PAGE_SIZE - len; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 42623d6b8f0e..f056ff931444 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -41,6 +41,7 @@ #include <trace/events/sunrpc.h> #include "sunrpc.h" +#include "sysfs.h" #include "netns.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -166,7 +167,7 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) case RPC_PIPEFS_MOUNT: if (clnt->cl_pipedir_objects.pdh_dentry != NULL) return 1; - if (atomic_read(&clnt->cl_count) == 0) + if (refcount_read(&clnt->cl_count) == 0) return 1; break; case RPC_PIPEFS_UMOUNT: @@ -327,6 +328,7 @@ err_auth: out: if (pipefs_sb) rpc_put_sb_net(net); + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); return err; } @@ -410,24 +412,26 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, } rpc_clnt_set_transport(clnt, xprt, timeout); + xprt->main = true; xprt_iter_init(&clnt->cl_xpi, xps); xprt_switch_put(xps); clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); - atomic_set(&clnt->cl_count, 1); + refcount_set(&clnt->cl_count, 1); if (nodename == NULL) nodename = utsname()->nodename; /* save the nodename */ rpc_clnt_set_nodename(clnt, nodename); + rpc_sysfs_client_setup(clnt, xps, rpc_net_ns(clnt)); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) goto out_no_path; if (parent) - atomic_inc(&parent->cl_count); + refcount_inc(&parent->cl_count); trace_rpc_clnt_new(clnt, xprt, program->name, args->servername); return clnt; @@ -733,6 +737,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); /* @@ -879,6 +884,7 @@ static void rpc_free_client_work(struct work_struct *work) * so they cannot be called in rpciod, so they are handled separately * here. */ + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); rpc_free_clid(clnt); rpc_clnt_remove_pipedir(clnt); @@ -912,18 +918,16 @@ rpc_free_client(struct rpc_clnt *clnt) static struct rpc_clnt * rpc_free_auth(struct rpc_clnt *clnt) { - if (clnt->cl_auth == NULL) - return rpc_free_client(clnt); - /* * Note: RPCSEC_GSS may need to send NULL RPC calls in order to * release remaining GSS contexts. This mechanism ensures * that it can do so safely. */ - atomic_inc(&clnt->cl_count); - rpcauth_release(clnt->cl_auth); - clnt->cl_auth = NULL; - if (atomic_dec_and_test(&clnt->cl_count)) + if (clnt->cl_auth != NULL) { + rpcauth_release(clnt->cl_auth); + clnt->cl_auth = NULL; + } + if (refcount_dec_and_test(&clnt->cl_count)) return rpc_free_client(clnt); return NULL; } @@ -937,7 +941,7 @@ rpc_release_client(struct rpc_clnt *clnt) do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); - if (!atomic_dec_and_test(&clnt->cl_count)) + if (refcount_dec_not_one(&clnt->cl_count)) break; clnt = rpc_free_auth(clnt); } while (clnt != NULL); @@ -1076,7 +1080,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) if (clnt != NULL) { rpc_task_set_transport(task, clnt); task->tk_client = clnt; - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_softerr) @@ -2100,6 +2104,30 @@ call_connect_status(struct rpc_task *task) case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: + if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) && + (task->tk_flags & RPC_TASK_MOVEABLE) && + test_bit(XPRT_REMOVE, &xprt->state)) { + struct rpc_xprt *saved = task->tk_xprt; + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + rcu_read_unlock(); + if (xps->xps_nxprts > 1) { + long value; + + xprt_release(task); + value = atomic_long_dec_return(&xprt->queuelen); + if (value == 0) + rpc_xprt_switch_remove_xprt(xps, saved); + xprt_put(saved); + task->tk_xprt = NULL; + task->tk_action = call_start; + } + xprt_switch_put(xps); + if (!task->tk_xprt) + return; + } goto out_retry; case -ENOBUFS: rpc_delay(task, HZ >> 2); @@ -2664,17 +2692,18 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; -static int rpc_ping(struct rpc_clnt *clnt) +static void +rpc_null_call_prepare(struct rpc_task *task, void *data) { - struct rpc_message msg = { - .rpc_proc = &rpcproc_null, - }; - int err; - err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN | - RPC_TASK_NULLCREDS); - return err; + task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT; + rpc_call_start(task); } +static const struct rpc_call_ops rpc_null_ops = { + .rpc_call_prepare = rpc_null_call_prepare, + .rpc_call_done = rpc_default_callback, +}; + static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, @@ -2688,7 +2717,7 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, .rpc_xprt = xprt, .rpc_message = &msg, .rpc_op_cred = cred, - .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, + .callback_ops = ops ?: &rpc_null_ops, .callback_data = data, .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, @@ -2703,6 +2732,19 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int } EXPORT_SYMBOL_GPL(rpc_call_null); +static int rpc_ping(struct rpc_clnt *clnt) +{ + struct rpc_task *task; + int status; + + task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; @@ -2726,6 +2768,7 @@ static void rpc_cb_add_xprt_release(void *calldata) } static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { + .rpc_call_prepare = rpc_null_call_prepare, .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; @@ -2744,6 +2787,15 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; + if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) { + rcu_read_lock(); + pr_warn("SUNRPC: reached max allowed number (%d) did not add " + "transport to server: %s\n", clnt->cl_max_connect, + rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + return -EINVAL; + } + data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) return -ENOMEM; @@ -2756,7 +2808,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); - + data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: return 1; diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 56029e3af6ff..7dc9cc929bfd 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -8,14 +8,14 @@ #include <linux/debugfs.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/clnt.h> + #include "netns.h" +#include "fail.h" static struct dentry *topdir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; -unsigned int rpc_inject_disconnect; - static int tasks_show(struct seq_file *f, void *v) { @@ -90,7 +90,7 @@ static int tasks_open(struct inode *inode, struct file *filp) struct seq_file *seq = filp->private_data; struct rpc_clnt *clnt = seq->private = inode->i_private; - if (!atomic_inc_not_zero(&clnt->cl_count)) { + if (!refcount_inc_not_zero(&clnt->cl_count)) { seq_release(inode, filp); ret = -EINVAL; } @@ -235,8 +235,6 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) /* make tasks file */ debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops); - - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -246,56 +244,30 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } -static int -fault_open(struct inode *inode, struct file *filp) -{ - filp->private_data = kmalloc(128, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - return 0; -} +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +struct fail_sunrpc_attr fail_sunrpc = { + .attr = FAULT_ATTR_INITIALIZER, +}; +EXPORT_SYMBOL_GPL(fail_sunrpc); -static int -fault_release(struct inode *inode, struct file *filp) +static void fail_sunrpc_init(void) { - kfree(filp->private_data); - return 0; -} + struct dentry *dir; -static ssize_t -fault_disconnect_read(struct file *filp, char __user *user_buf, - size_t len, loff_t *offset) -{ - char *buffer = (char *)filp->private_data; - size_t size; + dir = fault_create_debugfs_attr("fail_sunrpc", NULL, + &fail_sunrpc.attr); - size = sprintf(buffer, "%u\n", rpc_inject_disconnect); - return simple_read_from_buffer(user_buf, len, offset, buffer, size); -} + debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_client_disconnect); -static ssize_t -fault_disconnect_write(struct file *filp, const char __user *user_buf, - size_t len, loff_t *offset) + debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_server_disconnect); +} +#else +static void fail_sunrpc_init(void) { - char buffer[16]; - - if (len >= sizeof(buffer)) - len = sizeof(buffer) - 1; - if (copy_from_user(buffer, user_buf, len)) - return -EFAULT; - buffer[len] = '\0'; - if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) - return -EINVAL; - return len; } - -static const struct file_operations fault_disconnect_fops = { - .owner = THIS_MODULE, - .open = fault_open, - .read = fault_disconnect_read, - .write = fault_disconnect_write, - .release = fault_release, -}; +#endif void __exit sunrpc_debugfs_exit(void) @@ -309,16 +281,11 @@ sunrpc_debugfs_exit(void) void __init sunrpc_debugfs_init(void) { - struct dentry *rpc_fault_dir; - topdir = debugfs_create_dir("sunrpc", NULL); rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir); - rpc_fault_dir = debugfs_create_dir("inject_fault", topdir); - - debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL, - &fault_disconnect_fops); + fail_sunrpc_init(); } diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h new file mode 100644 index 000000000000..69dc30cc44b8 --- /dev/null +++ b/net/sunrpc/fail.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021, Oracle. All rights reserved. + */ + +#ifndef _NET_SUNRPC_FAIL_H_ +#define _NET_SUNRPC_FAIL_H_ + +#include <linux/fault-inject.h> + +#if IS_ENABLED(CONFIG_FAULT_INJECTION) + +struct fail_sunrpc_attr { + struct fault_attr attr; + + bool ignore_client_disconnect; + + bool ignore_server_disconnect; +}; + +extern struct fail_sunrpc_attr fail_sunrpc; + +#endif /* CONFIG_FAULT_INJECTION */ + +#endif /* _NET_SUNRPC_FAIL_H_ */ diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 09c000d490a1..ee5336d73fdd 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -423,7 +423,7 @@ rpc_info_open(struct inode *inode, struct file *file) spin_lock(&file->f_path.dentry->d_lock); if (!d_unhashed(file->f_path.dentry)) clnt = RPC_I(inode)->private; - if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) { + if (clnt != NULL && refcount_inc_not_zero(&clnt->cl_count)) { spin_unlock(&file->f_path.dentry->d_lock); m->private = clnt; } else { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 39ed0e0afe6d..c045f63d11fa 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -592,10 +592,20 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q struct rpc_task *task; /* + * Service the privileged queue. + */ + q = &queue->tasks[RPC_NR_PRIORITY - 1]; + if (queue->maxpriority > RPC_PRIORITY_PRIVILEGED && !list_empty(q)) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; + } + + /* * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q) && --queue->nr) { + if (!list_empty(q) && queue->nr) { + queue->nr--; task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto out; } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 236fadc4a439..691c0000e9ea 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,6 +24,7 @@ #include <linux/sunrpc/xprtsock.h> #include "sunrpc.h" +#include "sysfs.h" #include "netns.h" unsigned int sunrpc_net_id; @@ -103,6 +104,10 @@ init_sunrpc(void) if (err) goto out4; + err = rpc_sysfs_init(); + if (err) + goto out5; + sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); @@ -111,6 +116,8 @@ init_sunrpc(void) init_socket_xprt(); /* clnt sock transport */ return 0; +out5: + unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: @@ -124,7 +131,10 @@ out: static void __exit cleanup_sunrpc(void) { + rpc_sysfs_exit(); rpc_cleanup_clids(); + xprt_cleanup_ids(); + xprt_multipath_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0de918cb3d90..a3bbe5ce4570 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -31,6 +31,8 @@ #include <trace/events/sunrpc.h> +#include "fail.h" + #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); @@ -838,6 +840,27 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser } EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); +/** + * svc_rqst_replace_page - Replace one page in rq_pages[] + * @rqstp: svc_rqst with pages to replace + * @page: replacement page + * + * When replacing a page in rq_pages, batch the release of the + * replaced pages to avoid hammering the page allocator. + */ +void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +{ + if (*rqstp->rq_next_page) { + if (!pagevec_space(&rqstp->rq_pvec)) + __pagevec_release(&rqstp->rq_pvec); + pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + } + + get_page(page); + *(rqstp->rq_next_page++) = page; +} +EXPORT_SYMBOL_GPL(svc_rqst_replace_page); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. @@ -1163,22 +1186,6 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} #endif -__be32 -svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err) -{ - set_bit(RQ_AUTHERR, &rqstp->rq_flags); - return auth_err; -} -EXPORT_SYMBOL_GPL(svc_return_autherr); - -static __be32 -svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp) -{ - if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags)) - return *statp; - return rpc_auth_ok; -} - static int svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -1202,7 +1209,7 @@ svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) test_bit(RQ_DROPME, &rqstp->rq_flags)) return 0; - if (test_bit(RQ_AUTHERR, &rqstp->rq_flags)) + if (rqstp->rq_auth_stat != rpc_auth_ok) return 1; if (*statp != rpc_success) @@ -1283,7 +1290,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) struct svc_process_info process; __be32 *statp; u32 prog, vers; - __be32 auth_stat, rpc_stat; + __be32 rpc_stat; int auth_res; __be32 *reply_statp; @@ -1326,14 +1333,12 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) * We do this before anything else in order to get a decent * auth verifier. */ - auth_res = svc_authenticate(rqstp, &auth_stat); + auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ - if (auth_res == SVC_OK && progp) { - auth_stat = rpc_autherr_badcred; + if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); - } if (auth_res != SVC_OK) - trace_svc_authenticate(rqstp, auth_res, auth_stat); + trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; @@ -1392,15 +1397,15 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto release_dropit; if (*statp == rpc_garbage_args) goto err_garbage; - auth_stat = svc_get_autherr(rqstp, statp); - if (auth_stat != rpc_auth_ok) - goto err_release_bad_auth; } else { dprintk("svc: calling dispatcher\n"); if (!process.dispatch(rqstp, statp)) goto release_dropit; /* Release reply info */ } + if (rqstp->rq_auth_stat != rpc_auth_ok) + goto err_release_bad_auth; + /* Check RPC status result */ if (*statp != rpc_success) resv->iov_len = ((void*)statp) - resv->iov_base + 4; @@ -1450,13 +1455,14 @@ err_release_bad_auth: if (procp->pc_release) procp->pc_release(rqstp); err_bad_auth: - dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + dprintk("svc: authentication failed (%d)\n", + be32_to_cpu(rqstp->rq_auth_stat)); serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of accept status: */ xdr_ressize_check(rqstp, reply_statp); svc_putnl(resv, 1); /* REJECT */ svc_putnl(resv, 1); /* AUTH_ERROR */ - svc_putnl(resv, ntohl(auth_stat)); /* status */ + svc_putu32(resv, rqstp->rq_auth_stat); /* status */ goto sendit; err_bad_prog: @@ -1503,6 +1509,12 @@ svc_process(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; u32 dir; +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) + if (!fail_sunrpc.ignore_server_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + svc_xprt_deferred_close(rqstp->rq_xprt); +#endif + /* * Setup response xdr_buf. * Initially it has just one page @@ -1630,6 +1642,21 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** + * svc_proc_name - Return RPC procedure name in string form + * @rqstp: svc_rqst to operate on + * + * Return value: + * Pointer to a NUL-terminated string + */ +const char *svc_proc_name(const struct svc_rqst *rqstp) +{ + if (rqstp && rqstp->rq_procinfo) + return rqstp->rq_procinfo->pc_name; + return "unknown"; +} + + +/** * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d66a8e44a1ae..6316bd2b8f37 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -539,6 +539,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; + pagevec_release(&rqstp->rq_pvec); svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -662,7 +663,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages, filled; + unsigned long pages, filled, ret; + + pagevec_init(&rqstp->rq_pvec); pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { @@ -672,11 +675,12 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) pages = RPCSVC_MAXPAGES; } - for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); - if (filled == pages) - break; + for (filled = 0; filled < pages; filled = ret) { + ret = alloc_pages_bulk_array(GFP_KERNEL, pages, + rqstp->rq_pages); + if (ret > filled) + /* Made progress, don't sleep yet */ + continue; set_current_state(TASK_INTERRUPTIBLE); if (signalled() || kthread_should_stop()) { @@ -835,7 +839,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); - } + } else + svc_xprt_received(xprt); out: trace_svc_handle_xprt(xprt, len); return len; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 998b196b6176..5a8b8e03fdd4 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -59,12 +59,12 @@ svc_put_auth_ops(struct auth_ops *aops) } int -svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) +svc_authenticate(struct svc_rqst *rqstp) { rpc_authflavor_t flavor; struct auth_ops *aops; - *authp = rpc_auth_ok; + rqstp->rq_auth_stat = rpc_auth_ok; flavor = svc_getnl(&rqstp->rq_arg.head[0]); @@ -72,7 +72,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) aops = svc_get_auth_ops(flavor); if (aops == NULL) { - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } @@ -80,7 +80,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; - return aops->accept(rqstp, authp); + return aops->accept(rqstp); } EXPORT_SYMBOL_GPL(svc_authenticate); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 35b7966ac3b3..d7ed7d49115a 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -681,8 +681,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) rqstp->rq_client = NULL; if (rqstp->rq_proc == 0) - return SVC_OK; + goto out; + rqstp->rq_auth_stat = rpc_autherr_badcred; ipm = ip_map_cached_get(xprt); if (ipm == NULL) ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class, @@ -719,13 +720,16 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) put_group_info(cred->cr_group_info); cred->cr_group_info = gi; } + +out: + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } EXPORT_SYMBOL_GPL(svcauth_unix_set_client); static int -svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_null_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -736,12 +740,12 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) if (svc_getu32(argv) != 0) { dprintk("svc: bad null cred\n"); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { dprintk("svc: bad null verf\n"); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -785,7 +789,7 @@ struct auth_ops svcauth_null = { static int -svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_unix_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -827,7 +831,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) } groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -839,7 +843,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_OK; badcred: - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c new file mode 100644 index 000000000000..9a6f17e18f73 --- /dev/null +++ b/net/sunrpc/sysfs.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ +#include <linux/sunrpc/clnt.h> +#include <linux/kobject.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/xprtsock.h> + +#include "sysfs.h" + +struct xprt_addr { + const char *addr; + struct rcu_head rcu; +}; + +static void free_xprt_addr(struct rcu_head *head) +{ + struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu); + + kfree(addr->addr); + kfree(addr); +} + +static struct kset *rpc_sunrpc_kset; +static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; + +static void rpc_sysfs_object_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_ns_type_operations * +rpc_sysfs_object_child_ns_type(struct kobject *kobj) +{ + return &net_ns_type_operations; +} + +static struct kobj_type rpc_sysfs_object_type = { + .release = rpc_sysfs_object_release, + .sysfs_ops = &kobj_sysfs_ops, + .child_ns_type = rpc_sysfs_object_child_ns_type, +}; + +static struct kobject *rpc_sysfs_object_alloc(const char *name, + struct kset *kset, + struct kobject *parent) +{ + struct kobject *kobj; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (kobj) { + kobj->kset = kset; + if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, + parent, "%s", name) == 0) + return kobj; + kobject_put(kobj); + } + return NULL; +} + +static inline struct rpc_xprt * +rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *x = container_of(kobj, + struct rpc_sysfs_xprt, kobject); + + return xprt_get(x->xprt); +} + +static inline struct rpc_xprt_switch * +rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *x = container_of(kobj, + struct rpc_sysfs_xprt, kobject); + + return xprt_switch_get(x->xprt_switch); +} + +static inline struct rpc_xprt_switch * +rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) +{ + struct rpc_sysfs_xprt_switch *x = container_of(kobj, + struct rpc_sysfs_xprt_switch, kobject); + + return xprt_switch_get(x->xprt_switch); +} + +static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) + return 0; + ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt_put(xprt); + return ret + 1; +} + +static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct sockaddr_storage saddr; + struct sock_xprt *sock; + ssize_t ret = -1; + + if (!xprt) + return 0; + + sock = container_of(xprt, struct sock_xprt, xprt); + if (kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) + goto out; + + ret = sprintf(buf, "%pISc\n", &saddr); +out: + xprt_put(xprt); + return ret + 1; +} + +static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) + return 0; + + ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" + "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" + "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" + "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" + "tasks_queuelen=%ld\ndst_port=%s\n", + xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, + xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, + xprt->sending.qlen, xprt->pending.qlen, + xprt->backlog.qlen, xprt->main, + (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? + get_srcport(xprt) : 0, + atomic_long_read(&xprt->queuelen), + (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? + xprt->address_strings[RPC_DISPLAY_PORT] : "0"); + xprt_put(xprt); + return ret + 1; +} + +static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + int locked, connected, connecting, close_wait, bound, binding, + closing, congested, cwnd_wait, write_space, offline, remove; + + if (!xprt) + return 0; + + if (!xprt->state) { + ret = sprintf(buf, "state=CLOSED\n"); + } else { + locked = test_bit(XPRT_LOCKED, &xprt->state); + connected = test_bit(XPRT_CONNECTED, &xprt->state); + connecting = test_bit(XPRT_CONNECTING, &xprt->state); + close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state); + bound = test_bit(XPRT_BOUND, &xprt->state); + binding = test_bit(XPRT_BINDING, &xprt->state); + closing = test_bit(XPRT_CLOSING, &xprt->state); + congested = test_bit(XPRT_CONGESTED, &xprt->state); + cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); + write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); + offline = test_bit(XPRT_OFFLINE, &xprt->state); + remove = test_bit(XPRT_REMOVE, &xprt->state); + + ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n", + locked ? "LOCKED" : "", + connected ? "CONNECTED" : "", + connecting ? "CONNECTING" : "", + close_wait ? "CLOSE_WAIT" : "", + bound ? "BOUND" : "", + binding ? "BOUNDING" : "", + closing ? "CLOSING" : "", + congested ? "CONGESTED" : "", + cwnd_wait ? "CWND_WAIT" : "", + write_space ? "WRITE_SPACE" : "", + offline ? "OFFLINE" : "", + remove ? "REMOVE" : ""); + } + + xprt_put(xprt); + return ret + 1; +} + +static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt_switch *xprt_switch = + rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt_switch) + return 0; + ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" + "num_unique_destaddr=%u\nqueue_len=%ld\n", + xprt_switch->xps_nxprts, xprt_switch->xps_nactive, + xprt_switch->xps_nunique_destaddr_xprts, + atomic_long_read(&xprt_switch->xps_queuelen)); + xprt_switch_put(xprt_switch); + return ret + 1; +} + +static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct sockaddr *saddr; + char *dst_addr; + int port; + struct xprt_addr *saved_addr; + size_t buf_len; + + if (!xprt) + return 0; + if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || + xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { + xprt_put(xprt); + return -EOPNOTSUPP; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + saddr = (struct sockaddr *)&xprt->addr; + port = rpc_get_port(saddr); + + /* buf_len is the len until the first occurence of either + * '\n' or '\0' + */ + buf_len = strcspn(buf, "\n"); + + dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); + if (!dst_addr) + goto out_err; + saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); + if (!saved_addr) + goto out_err_free; + saved_addr->addr = + rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]); + rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr); + call_rcu(&saved_addr->rcu, free_xprt_addr); + xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr, + sizeof(*saddr)); + rpc_set_port(saddr, port); + + xprt_force_disconnect(xprt); +out: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + return count; +out_err_free: + kfree(dst_addr); +out_err: + count = -ENOMEM; + goto out; +} + +static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + int offline = 0, online = 0, remove = 0; + struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); + + if (!xprt) + return 0; + + if (!strncmp(buf, "offline", 7)) + offline = 1; + else if (!strncmp(buf, "online", 6)) + online = 1; + else if (!strncmp(buf, "remove", 6)) + remove = 1; + else + return -EINVAL; + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + if (xprt->main) { + count = -EINVAL; + goto release_tasks; + } + if (offline) { + set_bit(XPRT_OFFLINE, &xprt->state); + spin_lock(&xps->xps_lock); + xps->xps_nactive--; + spin_unlock(&xps->xps_lock); + } else if (online) { + clear_bit(XPRT_OFFLINE, &xprt->state); + spin_lock(&xps->xps_lock); + xps->xps_nactive++; + spin_unlock(&xps->xps_lock); + } else if (remove) { + if (test_bit(XPRT_OFFLINE, &xprt->state)) { + set_bit(XPRT_REMOVE, &xprt->state); + xprt_force_disconnect(xprt); + if (test_bit(XPRT_CONNECTED, &xprt->state)) { + if (!xprt->sending.qlen && + !xprt->pending.qlen && + !xprt->backlog.qlen && + !atomic_long_read(&xprt->queuelen)) + rpc_xprt_switch_remove_xprt(xps, xprt); + } + } else { + count = -EINVAL; + } + } + +release_tasks: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + xprt_switch_put(xps); + return count; +} + +int rpc_sysfs_init(void) +{ + rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); + if (!rpc_sunrpc_kset) + return -ENOMEM; + rpc_sunrpc_client_kobj = + rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_client_kobj) + goto err_client; + rpc_sunrpc_xprt_switch_kobj = + rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_xprt_switch_kobj) + goto err_switch; + return 0; +err_switch: + kobject_put(rpc_sunrpc_client_kobj); + rpc_sunrpc_client_kobj = NULL; +err_client: + kset_unregister(rpc_sunrpc_kset); + rpc_sunrpc_kset = NULL; + return -ENOMEM; +} + +static void rpc_sysfs_client_release(struct kobject *kobj) +{ + struct rpc_sysfs_client *c; + + c = container_of(kobj, struct rpc_sysfs_client, kobject); + kfree(c); +} + +static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) +{ + struct rpc_sysfs_xprt_switch *xprt_switch; + + xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); + kfree(xprt_switch); +} + +static void rpc_sysfs_xprt_release(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *xprt; + + xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject); + kfree(xprt); +} + +static const void *rpc_sysfs_client_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_client, kobject)->net; +} + +static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; +} + +static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_xprt, + kobject)->xprt->xprt_net; +} + +static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, + 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); + +static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, + 0644, rpc_sysfs_xprt_srcaddr_show, NULL); + +static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, + 0444, rpc_sysfs_xprt_info_show, NULL); + +static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, + 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); + +static struct attribute *rpc_sysfs_xprt_attrs[] = { + &rpc_sysfs_xprt_dstaddr.attr, + &rpc_sysfs_xprt_srcaddr.attr, + &rpc_sysfs_xprt_info.attr, + &rpc_sysfs_xprt_change_state.attr, + NULL, +}; + +static struct kobj_attribute rpc_sysfs_xprt_switch_info = + __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); + +static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { + &rpc_sysfs_xprt_switch_info.attr, + NULL, +}; + +static struct kobj_type rpc_sysfs_client_type = { + .release = rpc_sysfs_client_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_client_namespace, +}; + +static struct kobj_type rpc_sysfs_xprt_switch_type = { + .release = rpc_sysfs_xprt_switch_release, + .default_attrs = rpc_sysfs_xprt_switch_attrs, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_xprt_switch_namespace, +}; + +static struct kobj_type rpc_sysfs_xprt_type = { + .release = rpc_sysfs_xprt_release, + .default_attrs = rpc_sysfs_xprt_attrs, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_xprt_namespace, +}; + +void rpc_sysfs_exit(void) +{ + kobject_put(rpc_sunrpc_client_kobj); + kobject_put(rpc_sunrpc_xprt_switch_kobj); + kset_unregister(rpc_sunrpc_kset); +} + +static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, + struct net *net, + int clid) +{ + struct rpc_sysfs_client *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p) { + p->net = net; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type, + parent, "clnt-%d", clid) == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + +static struct rpc_sysfs_xprt_switch * +rpc_sysfs_xprt_switch_alloc(struct kobject *parent, + struct rpc_xprt_switch *xprt_switch, + struct net *net, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt_switch *p; + + p = kzalloc(sizeof(*p), gfp_flags); + if (p) { + p->net = net; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, + &rpc_sysfs_xprt_switch_type, + parent, "switch-%d", + xprt_switch->xps_id) == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + +static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt *p; + + p = kzalloc(sizeof(*p), gfp_flags); + if (!p) + goto out; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type, + parent, "xprt-%d-%s", xprt->id, + xprt->address_strings[RPC_DISPLAY_PROTO]) == 0) + return p; + kobject_put(&p->kobject); +out: + return NULL; +} + +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xprt_switch, + struct net *net) +{ + struct rpc_sysfs_client *rpc_client; + + rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, + net, clnt->cl_clid); + if (rpc_client) { + char name[] = "switch"; + struct rpc_sysfs_xprt_switch *xswitch = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + int ret; + + clnt->cl_sysfs = rpc_client; + rpc_client->clnt = clnt; + rpc_client->xprt_switch = xprt_switch; + kobject_uevent(&rpc_client->kobject, KOBJ_ADD); + ret = sysfs_create_link_nowarn(&rpc_client->kobject, + &xswitch->kobject, name); + if (ret) + pr_warn("can't create link to %s in sysfs (%d)\n", + name, ret); + } +} + +void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt_switch *rpc_xprt_switch; + struct net *net; + + if (xprt_switch->xps_net) + net = xprt_switch->xps_net; + else + net = xprt->xprt_net; + rpc_xprt_switch = + rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj, + xprt_switch, net, gfp_flags); + if (rpc_xprt_switch) { + xprt_switch->xps_sysfs = rpc_xprt_switch; + rpc_xprt_switch->xprt_switch = xprt_switch; + rpc_xprt_switch->xprt = xprt; + kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); + } +} + +void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt *rpc_xprt; + struct rpc_sysfs_xprt_switch *switch_obj = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + + rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); + if (rpc_xprt) { + xprt->xprt_sysfs = rpc_xprt; + rpc_xprt->xprt = xprt; + rpc_xprt->xprt_switch = xprt_switch; + kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); + } +} + +void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) +{ + struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; + + if (rpc_client) { + char name[] = "switch"; + + sysfs_remove_link(&rpc_client->kobject, name); + kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); + kobject_del(&rpc_client->kobject); + kobject_put(&rpc_client->kobject); + clnt->cl_sysfs = NULL; + } +} + +void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) +{ + struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs; + + if (rpc_xprt_switch) { + kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE); + kobject_del(&rpc_xprt_switch->kobject); + kobject_put(&rpc_xprt_switch->kobject); + xprt_switch->xps_sysfs = NULL; + } +} + +void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt) +{ + struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs; + + if (rpc_xprt) { + kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE); + kobject_del(&rpc_xprt->kobject); + kobject_put(&rpc_xprt->kobject); + xprt->xprt_sysfs = NULL; + } +} diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h new file mode 100644 index 000000000000..6620cebd1037 --- /dev/null +++ b/net/sunrpc/sysfs.h @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ +#ifndef __SUNRPC_SYSFS_H +#define __SUNRPC_SYSFS_H + +struct rpc_sysfs_client { + struct kobject kobject; + struct net *net; + struct rpc_clnt *clnt; + struct rpc_xprt_switch *xprt_switch; +}; + +struct rpc_sysfs_xprt_switch { + struct kobject kobject; + struct net *net; + struct rpc_xprt_switch *xprt_switch; + struct rpc_xprt *xprt; +}; + +struct rpc_sysfs_xprt { + struct kobject kobject; + struct rpc_xprt *xprt; + struct rpc_xprt_switch *xprt_switch; +}; + +int rpc_sysfs_init(void); +void rpc_sysfs_exit(void); + +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xprt_switch, + struct net *net); +void rpc_sysfs_client_destroy(struct rpc_clnt *clnt); +void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, gfp_t gfp_flags); +void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt); +void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, gfp_t gfp_flags); +void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt); + +#endif diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 3964ff74ee51..ca10ba2626f2 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1230,10 +1230,9 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) { - base = maxlen; - maxlen = 0; - } else + if (base >= maxlen) + return 0; + else maxlen -= base; if (len > maxlen) len = maxlen; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3509a7f139b9..cfd681700d1a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -55,6 +55,8 @@ #include <trace/events/sunrpc.h> #include "sunrpc.h" +#include "sysfs.h" +#include "fail.h" /* * Local variables @@ -443,7 +445,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); -static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task != task) return; @@ -760,6 +762,20 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) EXPORT_SYMBOL_GPL(xprt_disconnect_done); /** + * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call + * @xprt: transport to disconnect + */ +static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(xprtiod_workqueue, &xprt->task_cleanup); + else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + xprt->snd_task, -ENOTCONN); +} + +/** * xprt_force_disconnect - force a transport to disconnect * @xprt: transport to disconnect * @@ -770,13 +786,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task) - rpc_wake_up_queued_task_set_status(&xprt->pending, - xprt->snd_task, -ENOTCONN); + xprt_schedule_autoclose_locked(xprt); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -816,11 +826,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) goto out; if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_schedule_autoclose_locked(xprt); out: spin_unlock(&xprt->transport_lock); } @@ -854,6 +860,19 @@ xprt_init_autodisconnect(struct timer_list *t) queue_work(xprtiod_workqueue, &xprt->task_cleanup); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!fail_sunrpc.ignore_client_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) @@ -865,12 +884,14 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + set_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->snd_task = cookie; ret = true; out: spin_unlock(&xprt->transport_lock); return ret; } +EXPORT_SYMBOL_GPL(xprt_lock_connect); void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) { @@ -880,12 +901,14 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; + clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: spin_unlock(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); } +EXPORT_SYMBOL_GPL(xprt_unlock_connect); /** * xprt_connect - schedule a transport connect operation @@ -1746,6 +1769,30 @@ static void xprt_free_all_slots(struct rpc_xprt *xprt) } } +static DEFINE_IDA(rpc_xprt_ids); + +void xprt_cleanup_ids(void) +{ + ida_destroy(&rpc_xprt_ids); +} + +static int xprt_alloc_id(struct rpc_xprt *xprt) +{ + int id; + + id = ida_simple_get(&rpc_xprt_ids, 0, 0, GFP_KERNEL); + if (id < 0) + return id; + + xprt->id = id; + return 0; +} + +static void xprt_free_id(struct rpc_xprt *xprt) +{ + ida_simple_remove(&rpc_xprt_ids, xprt->id); +} + struct rpc_xprt *xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_alloc) @@ -1758,6 +1805,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, if (xprt == NULL) goto out; + xprt_alloc_id(xprt); xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { @@ -1786,6 +1834,8 @@ void xprt_free(struct rpc_xprt *xprt) { put_net(xprt->xprt_net); xprt_free_all_slots(xprt); + xprt_free_id(xprt); + rpc_sysfs_xprt_destroy(xprt); kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 78c075a68c04..1693f81aae37 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -7,18 +7,20 @@ * Trond Myklebust <trond.myklebust@primarydata.com> * */ +#include <linux/atomic.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/slab.h> -#include <asm/cmpxchg.h> #include <linux/spinlock.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtmultipath.h> +#include "sysfs.h" + typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur); @@ -55,6 +57,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); + rpc_sysfs_xprt_setup(xps, xprt, GFP_KERNEL); } static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, @@ -62,7 +65,8 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, { if (unlikely(xprt == NULL)) return; - xps->xps_nactive--; + if (!test_bit(XPRT_OFFLINE, &xprt->state)) + xps->xps_nactive--; xps->xps_nxprts--; if (xps->xps_nxprts == 0) xps->xps_net = NULL; @@ -86,6 +90,30 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, xprt_put(xprt); } +static DEFINE_IDA(rpc_xprtswitch_ids); + +void xprt_multipath_cleanup_ids(void) +{ + ida_destroy(&rpc_xprtswitch_ids); +} + +static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) +{ + int id; + + id = ida_simple_get(&rpc_xprtswitch_ids, 0, 0, gfp_flags); + if (id < 0) + return id; + + xps->xps_id = id; + return 0; +} + +static void xprt_switch_free_id(struct rpc_xprt_switch *xps) +{ + ida_simple_remove(&rpc_xprtswitch_ids, xps->xps_id); +} + /** * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt @@ -103,12 +131,16 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); + xprt_switch_alloc_id(xps, gfp_flags); xps->xps_nxprts = xps->xps_nactive = 0; atomic_long_set(&xps->xps_queuelen, 0); xps->xps_net = NULL; INIT_LIST_HEAD(&xps->xps_xprt_list); xps->xps_iter_ops = &rpc_xprt_iter_singular; + rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); + xps->xps_nunique_destaddr_xprts = 1; + rpc_sysfs_xprt_setup(xps, xprt, gfp_flags); } return xps; @@ -136,6 +168,8 @@ static void xprt_switch_free(struct kref *kref) struct rpc_xprt_switch, xps_kref); xprt_switch_free_entries(xps); + rpc_sysfs_xprt_switch_destroy(xps); + xprt_switch_free_id(xps); kfree_rcu(xps, xps_rcu); } @@ -198,7 +232,8 @@ void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi) static bool xprt_is_active(const struct rpc_xprt *xprt) { - return kref_read(&xprt->kref) != 0; + return (kref_read(&xprt->kref) != 0 && + !test_bit(XPRT_OFFLINE, &xprt->state)); } static diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1151efd09b27..17f174d6ea3b 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_post_sends(r_xprt, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; return 0; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 229fcc9a9064..f700b34a5bfd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -394,6 +394,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; unsigned int num_wrs; + int ret; num_wrs = 1; post_wr = send_wr; @@ -420,7 +421,10 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) } trace_xprtrdma_post_send(req); - return ib_post_send(ep->re_id->qp, post_wr, NULL); + ret = ib_post_send(ep->re_id->qp, post_wr, NULL); + if (ret) + trace_xprtrdma_post_send_err(r_xprt, req, ret); + return ret; } /** @@ -557,6 +561,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* On error, the MRs get destroyed once the QP has drained. */ trace_xprtrdma_post_linv_err(req, rc); + + /* Force a connection loss to ensure complete recovery. + */ + rpcrdma_force_disconnect(ep); } /** @@ -653,4 +661,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * retransmission. */ rpcrdma_unpin_rqst(req->rl_reply); + + /* Force a connection loss to ensure complete recovery. + */ + rpcrdma_force_disconnect(ep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 5238bc829235..e27433f08ca7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -35,6 +35,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); * controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { + struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; @@ -53,19 +54,19 @@ static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_rw_ctxt_lock); - - ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); - if (ctxt) { - list_del(&ctxt->rw_list); - spin_unlock(&rdma->sc_rw_ctxt_lock); + node = llist_del_first(&rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); + if (node) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - spin_unlock(&rdma->sc_rw_ctxt_lock); ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) goto out_noctx; + INIT_LIST_HEAD(&ctxt->rw_list); } @@ -83,14 +84,18 @@ out_noctx: return NULL; } -static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt) +static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) { sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + llist_add(&ctxt->rw_node, list); +} - spin_lock(&rdma->sc_rw_ctxt_lock); - list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); - spin_unlock(&rdma->sc_rw_ctxt_lock); +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); } /** @@ -101,9 +106,10 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { - list_del(&ctxt->rw_list); + while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); kfree(ctxt); } } @@ -171,20 +177,35 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, cc->cc_sqecount = 0; } +/* + * The consumed rw_ctx's are cleaned and placed on a local llist so + * that only one atomic llist operation is needed to put them all + * back on the free list. + */ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; + struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; + LLIST_HEAD(free); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - svc_rdma_put_rw_ctxt(rdma, ctxt); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + + ctxt->rw_node.next = first; + first = &ctxt->rw_node; + if (!last) + last = first; } + if (first) + llist_add_batch(first, last, &rdma->sc_rw_ctxts); } /* State for sending a Write or Reply chunk. @@ -248,8 +269,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); @@ -304,9 +324,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); cc->cc_status = wc->status; complete(&cc->cc_done); return; @@ -483,7 +501,7 @@ out_overflow: * @iov: kvec to write * * Returns: - * On succes, returns zero + * On success, returns zero * %-E2BIG if the client-provided Write chunk is too small * %-ENOMEM if a resource has been exhausted * %-EIO if an rdma-rw error occurred @@ -504,7 +522,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info, * @length: number of bytes to write * * Returns: - * On succes, returns zero + * On success, returns zero * %-E2BIG if the client-provided Write chunk is too small * %-ENOMEM if a resource has been exhausted * %-EIO if an rdma-rw error occurred @@ -526,7 +544,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, * @data: pointer to write arguments * * Returns: - * On succes, returns zero + * On success, returns zero * %-E2BIG if the client-provided Write chunk is too small * %-ENOMEM if a resource has been exhausted * %-EIO if an rdma-rw error occurred diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d6bbafb773e1..599021b2391d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -113,13 +113,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static inline struct svc_rdma_send_ctxt * -svc_rdma_next_send_ctxt(struct list_head *list) -{ - return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, - sc_list); -} - static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, struct rpc_rdma_cid *cid) { @@ -182,9 +175,10 @@ fail0: void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { - list_del(&ctxt->sc_list); + while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); ib_dma_unmap_single(rdma->sc_pd->device, ctxt->sc_sges[0].addr, rdma->sc_max_req_size, @@ -204,12 +198,13 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_send_lock); - ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_send_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->sc_list); + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); spin_unlock(&rdma->sc_send_lock); out: @@ -253,9 +248,21 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, ctxt->sc_sges[i].length); } - spin_lock(&rdma->sc_send_lock); - list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); - spin_unlock(&rdma->sc_send_lock); + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); +} + +/** + * svc_rdma_wake_send_waiters - manage Send Queue accounting + * @rdma: controlling transport + * @avail: Number of additional SQEs that are now available + * + */ +void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) +{ + atomic_add(avail, &rdma->sc_sq_avail); + smp_mb__after_atomic(); + if (unlikely(waitqueue_active(&rdma->sc_send_wait))) + wake_up(&rdma->sc_send_wait); } /** @@ -275,11 +282,9 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + svc_rdma_wake_send_waiters(rdma, 1); complete(&ctxt->sc_done); - atomic_inc(&rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d94b7759ada1..94b20fb47135 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -136,9 +136,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); - INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -545,7 +545,6 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - struct svc_xprt *xprt = &rdma->sc_xprt; /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -553,12 +552,6 @@ static void __svc_rdma_free(struct work_struct *work) svc_rdma_flush_recv_queues(rdma); - /* Final put of backchannel client transport */ - if (xprt->xpt_bc_xprt) { - xprt_put(xprt->xpt_bc_xprt); - xprt->xpt_bc_xprt = NULL; - } - svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 19a49d26b1e4..16e5696314a4 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -73,6 +73,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; +static struct xprt_class xprt_rdma; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -249,12 +250,9 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt->stat.connect_start; xprt_set_connected(xprt); rc = -EAGAIN; - } else { - /* Force a call to xprt_rdma_close to clean up */ - spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - spin_unlock(&xprt->transport_lock); - } + } else + rpcrdma_xprt_disconnect(r_xprt); + xprt_unlock_connect(xprt, r_xprt); xprt_wake_pending_tasks(xprt, rc); } @@ -349,6 +347,7 @@ xprt_setup_rdma(struct xprt_create *args) /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xprt_rdma; xprt->addrlen = args->addrlen; memcpy(&xprt->addr, sap, xprt->addrlen); @@ -487,6 +486,8 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt)); + delay = 0; if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); @@ -659,7 +660,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_post_sends(r_xprt, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 649c23518ec0..aaec3c9be8db 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -124,7 +124,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * connection is closed or lost. (The important thing is it needs * to be invoked "at least" once). */ -static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) { if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) xprt_force_disconnect(ep->re_xprt); @@ -1350,21 +1350,6 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) } /** - * rpcrdma_post_sends - Post WRs to a transport's Send Queue - * @r_xprt: controlling transport instance - * @req: rpcrdma_req containing the Send WR to post - * - * Returns 0 if the post was successful, otherwise -ENOTCONN - * is returned. - */ -int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) -{ - if (frwr_send(r_xprt, req)) - return -ENOTCONN; - return 0; -} - -/** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance * @needed: current credit grant @@ -1416,12 +1401,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); - if (atomic_dec_return(&ep->re_receiving) > 0) - complete(&ep->re_done); - -out: - trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { + trace_xprtrdma_post_recvs_err(r_xprt, rc); for (wr = bad_wr; wr;) { struct rpcrdma_rep *rep; @@ -1431,6 +1412,11 @@ out: --count; } } + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + +out: + trace_xprtrdma_post_recvs(r_xprt, count); ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5d231d94e944..d91f54eae00b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -454,11 +454,11 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Endpoint calls - xprtrdma/verbs.c */ +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep); void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 316d04945587..04f1b78bcbca 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -91,6 +91,11 @@ static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static struct ctl_table_header *sunrpc_table_header; +static struct xprt_class xs_local_transport; +static struct xprt_class xs_udp_transport; +static struct xprt_class xs_tcp_transport; +static struct xprt_class xs_bc_tcp_transport; + /* * FIXME: changing the UDP slot table size should also resize the UDP * socket buffers for existing UDP transports @@ -1648,6 +1653,13 @@ static int xs_get_srcport(struct sock_xprt *transport) return port; } +unsigned short get_srcport(struct rpc_xprt *xprt) +{ + struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); + return xs_sock_getport(sock->sock); +} +EXPORT_SYMBOL(get_srcport); + static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { if (transport->srcport != 0) @@ -1689,7 +1701,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); if (err == 0) { - transport->srcport = port; + if (transport->xprt.reuseport) + transport->srcport = port; break; } last = port; @@ -2086,13 +2099,20 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) if (sock == NULL) return; + if (!xprt->reuseport) { + xs_close(xprt); + return; + } switch (skst) { - default: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + break; + case TCP_ESTABLISHED: + case TCP_CLOSE_WAIT: kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); break; - case TCP_CLOSE: - case TCP_TIME_WAIT: + default: xs_reset_transport(transport); } } @@ -2779,6 +2799,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = 0; + xprt->xprt_class = &xs_local_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; @@ -2848,6 +2869,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_UDP; + xprt->xprt_class = &xs_udp_transport; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); @@ -2928,6 +2950,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xs_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; @@ -3001,6 +3024,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xs_bc_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->timeout = &xs_tcp_default_timeout; @@ -3132,24 +3156,6 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret) - return ret; - if (num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 89a36db47ab4..0b2c18efc079 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -378,22 +378,283 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); +struct switchdev_nested_priv { + bool (*check_cb)(const struct net_device *dev); + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev); + const struct net_device *dev; + struct net_device *lower_dev; +}; + +static int switchdev_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + struct switchdev_nested_priv *switchdev_priv = priv->data; + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev); + bool (*check_cb)(const struct net_device *dev); + const struct net_device *dev; + + check_cb = switchdev_priv->check_cb; + foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb; + dev = switchdev_priv->dev; + + if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) { + switchdev_priv->lower_dev = lower_dev; + return 1; + } + + return 0; +} + +static struct net_device * +switchdev_lower_dev_find(struct net_device *dev, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev)) +{ + struct switchdev_nested_priv switchdev_priv = { + .check_cb = check_cb, + .foreign_dev_check_cb = foreign_dev_check_cb, + .dev = dev, + .lower_dev = NULL, + }; + struct netdev_nested_priv priv = { + .data = &switchdev_priv, + }; + + netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); + + return switchdev_priv.lower_dev; +} + +static int __switchdev_handle_fdb_add_to_device(struct net_device *dev, + const struct net_device *orig_dev, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info), + int (*lag_add_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + const struct switchdev_notifier_info *info = &fdb_info->info; + struct net_device *br, *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) + return add_cb(dev, orig_dev, info->ctx, fdb_info); + + if (netif_is_lag_master(dev)) { + if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) + goto maybe_bridged_with_us; + + /* This is a LAG interface that we offload */ + if (!lag_add_cb) + return -EOPNOTSUPP; + + return lag_add_cb(dev, orig_dev, info->ctx, fdb_info); + } + + /* Recurse through lower interfaces in case the FDB entry is pointing + * towards a bridge device. + */ + if (netif_is_bridge_master(dev)) { + if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) + return 0; + + /* This is a bridge interface that we offload */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + /* Do not propagate FDB entries across bridges */ + if (netif_is_bridge_master(lower_dev)) + continue; + + /* Bridge ports might be either us, or LAG interfaces + * that we offload. + */ + if (!check_cb(lower_dev) && + !switchdev_lower_dev_find(lower_dev, check_cb, + foreign_dev_check_cb)) + continue; + + err = __switchdev_handle_fdb_add_to_device(lower_dev, orig_dev, + fdb_info, check_cb, + foreign_dev_check_cb, + add_cb, lag_add_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return 0; + } + +maybe_bridged_with_us: + /* Event is neither on a bridge nor a LAG. Check whether it is on an + * interface that is in a bridge with us. + */ + br = netdev_master_upper_dev_get_rcu(dev); + if (!br || !netif_is_bridge_master(br)) + return 0; + + if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb)) + return 0; + + return __switchdev_handle_fdb_add_to_device(br, orig_dev, fdb_info, + check_cb, foreign_dev_check_cb, + add_cb, lag_add_cb); +} + +int switchdev_handle_fdb_add_to_device(struct net_device *dev, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info), + int (*lag_add_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + int err; + + err = __switchdev_handle_fdb_add_to_device(dev, dev, fdb_info, + check_cb, + foreign_dev_check_cb, + add_cb, lag_add_cb); + if (err == -EOPNOTSUPP) + err = 0; + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_fdb_add_to_device); + +static int __switchdev_handle_fdb_del_to_device(struct net_device *dev, + const struct net_device *orig_dev, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info), + int (*lag_del_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + const struct switchdev_notifier_info *info = &fdb_info->info; + struct net_device *br, *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) + return del_cb(dev, orig_dev, info->ctx, fdb_info); + + if (netif_is_lag_master(dev)) { + if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) + goto maybe_bridged_with_us; + + /* This is a LAG interface that we offload */ + if (!lag_del_cb) + return -EOPNOTSUPP; + + return lag_del_cb(dev, orig_dev, info->ctx, fdb_info); + } + + /* Recurse through lower interfaces in case the FDB entry is pointing + * towards a bridge device. + */ + if (netif_is_bridge_master(dev)) { + if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) + return 0; + + /* This is a bridge interface that we offload */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + /* Do not propagate FDB entries across bridges */ + if (netif_is_bridge_master(lower_dev)) + continue; + + /* Bridge ports might be either us, or LAG interfaces + * that we offload. + */ + if (!check_cb(lower_dev) && + !switchdev_lower_dev_find(lower_dev, check_cb, + foreign_dev_check_cb)) + continue; + + err = __switchdev_handle_fdb_del_to_device(lower_dev, orig_dev, + fdb_info, check_cb, + foreign_dev_check_cb, + del_cb, lag_del_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return 0; + } + +maybe_bridged_with_us: + /* Event is neither on a bridge nor a LAG. Check whether it is on an + * interface that is in a bridge with us. + */ + br = netdev_master_upper_dev_get_rcu(dev); + if (!br || !netif_is_bridge_master(br)) + return 0; + + if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb)) + return 0; + + return __switchdev_handle_fdb_del_to_device(br, orig_dev, fdb_info, + check_cb, foreign_dev_check_cb, + del_cb, lag_del_cb); +} + +int switchdev_handle_fdb_del_to_device(struct net_device *dev, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info), + int (*lag_del_cb)(struct net_device *dev, + const struct net_device *orig_dev, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + int err; + + err = __switchdev_handle_fdb_del_to_device(dev, dev, fdb_info, + check_cb, + foreign_dev_check_cb, + del_cb, lag_del_cb); + if (err == -EOPNOTSUPP) + err = 0; + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_fdb_del_to_device); + static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { + struct switchdev_notifier_info *info = &port_obj_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; - extack = switchdev_notifier_info_to_extack(&port_obj_info->info); + extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { - err = add_cb(dev, port_obj_info->obj, extack); + err = add_cb(dev, info->ctx, port_obj_info->obj, extack); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; @@ -422,7 +683,7 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev, int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*add_cb)(struct net_device *dev, + int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)) { @@ -439,15 +700,16 @@ EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_add); static int __switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { + struct switchdev_notifier_info *info = &port_obj_info->info; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; if (check_cb(dev)) { - err = del_cb(dev, port_obj_info->obj); + err = del_cb(dev, info->ctx, port_obj_info->obj); if (err != -EOPNOTSUPP) port_obj_info->handled = true; return err; @@ -476,7 +738,7 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev, int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), - int (*del_cb)(struct net_device *dev, + int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)) { int err; @@ -492,19 +754,20 @@ EXPORT_SYMBOL_GPL(switchdev_handle_port_obj_del); static int __switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { + struct switchdev_notifier_info *info = &port_attr_info->info; struct netlink_ext_ack *extack; struct net_device *lower_dev; struct list_head *iter; int err = -EOPNOTSUPP; - extack = switchdev_notifier_info_to_extack(&port_attr_info->info); + extack = switchdev_notifier_info_to_extack(info); if (check_cb(dev)) { - err = set_cb(dev, port_attr_info->attr, extack); + err = set_cb(dev, info->ctx, port_attr_info->attr, extack); if (err != -EOPNOTSUPP) port_attr_info->handled = true; return err; @@ -533,7 +796,7 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev, int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, bool (*check_cb)(const struct net_device *dev), - int (*set_cb)(struct net_device *dev, + int (*set_cb)(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack)) { @@ -546,3 +809,51 @@ int switchdev_handle_port_attr_set(struct net_device *dev, return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_attr_set); + +int switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .dev = dev, + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + .tx_fwd_offload = tx_fwd_offload, + }, + }; + int err; + + ASSERT_RTNL(); + + err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_OFFLOADED, + brport_dev, &brport_info.info, + extack); + return notifier_to_errno(err); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload); + +void switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + }, + }; + + ASSERT_RTNL(); + + call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_UNOFFLOADED, + brport_dev, &brport_info.info, + NULL); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index d4beca895992..593846d25214 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -699,7 +699,7 @@ int tipc_bcast_init(struct net *net) spin_lock_init(&tipc_net(net)->bclock); if (!tipc_link_bc_create(net, 0, 0, NULL, - FB_MTU, + one_page_mtu, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, 0, diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index e5c43d4d5a75..c9391d38de85 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -898,16 +898,10 @@ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, if (unlikely(!aead)) return -ENOKEY; - /* Cow skb data if needed */ - if (likely(!skb_cloned(skb) && - (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) { - nsg = 1 + skb_shinfo(skb)->nr_frags; - } else { - nsg = skb_cow_data(skb, 0, &unused); - if (unlikely(nsg < 0)) { - pr_err("RX: skb_cow_data() returned %d\n", nsg); - return nsg; - } + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + pr_err("RX: skb_cow_data() returned %d\n", nsg); + return nsg; } /* Allocate memory for the AEAD operation */ diff --git a/net/tipc/link.c b/net/tipc/link.c index c44b4bfaaee6..1b7a487c8841 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -654,6 +654,7 @@ int tipc_link_fsm_evt(struct tipc_link *l, int evt) break; case LINK_FAILOVER_BEGIN_EVT: l->state = LINK_FAILINGOVER; + break; case LINK_FAILURE_EVT: case LINK_RESET_EVT: case LINK_ESTABLISH_EVT: diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ce6ab54822d8..5c9fd4791c4b 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -41,19 +41,18 @@ #include "name_table.h" #include "crypto.h" +#define BUF_ALIGN(x) ALIGN(x, 4) #define MAX_FORWARD_SIZE 1024 #ifdef CONFIG_TIPC_CRYPTO #define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) -#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE) +#define BUF_OVERHEAD (BUF_HEADROOM + TIPC_AES_GCM_TAG_SIZE) #else #define BUF_HEADROOM (LL_MAX_HEADER + 48) -#define BUF_TAILROOM 16 +#define BUF_OVERHEAD BUF_HEADROOM #endif -static unsigned int align(unsigned int i) -{ - return (i + 3) & ~3u; -} +const int one_page_mtu = PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); /** * tipc_buf_acquire - creates a TIPC message buffer @@ -69,13 +68,8 @@ static unsigned int align(unsigned int i) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; -#ifdef CONFIG_TIPC_CRYPTO - unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u; -#else - unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; -#endif - skb = alloc_skb_fclone(buf_size, gfp); + skb = alloc_skb_fclone(BUF_OVERHEAD + size, gfp); if (skb) { skb_reserve(skb, BUF_HEADROOM); skb_put(skb, size); @@ -395,7 +389,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, if (unlikely(!skb)) { if (pktmax != MAX_MSG_SIZE) return -ENOMEM; - rc = tipc_msg_build(mhdr, m, offset, dsz, FB_MTU, list); + rc = tipc_msg_build(mhdr, m, offset, dsz, + one_page_mtu, list); if (rc != dsz) return rc; if (tipc_msg_assemble(list)) @@ -490,7 +485,7 @@ static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, msz = msg_size(msg); bsz = msg_size(bmsg); - offset = align(bsz); + offset = BUF_ALIGN(bsz); pad = offset - bsz; if (unlikely(skb_tailroom(bskb) < (pad + msz))) @@ -547,7 +542,7 @@ bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, /* Make a new bundle of the two messages if possible */ tsz = msg_size(buf_msg(tskb)); - if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg))) + if (unlikely(mss < BUF_ALIGN(INT_H_SIZE + tsz) + msg_size(msg))) return true; if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, GFP_ATOMIC))) @@ -606,7 +601,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) if (unlikely(!tipc_msg_validate(iskb))) goto none; - *pos += align(imsz); + *pos += BUF_ALIGN(imsz); return true; none: kfree_skb(skb); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 5d64596ba987..64ae4c4c44f8 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -99,9 +99,10 @@ struct plist; #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) -#define FB_MTU 3744 #define TIPC_MEDIA_INFO_OFFSET 5 +extern const int one_page_mtu; + struct tipc_skb_cb { union { struct { diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index fecab516bf41..01396dd1c899 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -673,12 +673,12 @@ exit: * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, - bool exact, struct list_head *dports) + struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; - u32 scope = ua->scope; + u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); @@ -688,7 +688,7 @@ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { - if (p->scope == scope || (!exact && p->scope < scope)) + if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index c7c9a3ddd420..259f95e3d99c 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -51,6 +51,8 @@ struct tipc_uaddr; #define TIPC_PUBL_SCOPE_NUM (TIPC_NODE_SCOPE + 1) #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ +#define TIPC_ANY_SCOPE 10 /* Both node and cluster scope will match */ + /** * struct publication - info about a published service address or range * @sr: service range represented by this publication @@ -113,7 +115,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk); void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, - bool exact, struct list_head *dports); + struct list_head *dports); void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes); bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, diff --git a/net/tipc/node.c b/net/tipc/node.c index 81af92954c6c..9947b7dfe1d2 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1214,7 +1214,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. - * It may also be an cloned or malicious peer having + * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 53af72824c9c..a0a27d87f631 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -73,9 +73,6 @@ struct sockaddr_pair { /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API - * @conn_type: TIPC type used when connection was established - * @conn_instance: TIPC instance used when connection was established - * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port * @maxnagle: maximum size of msg which can be subject to nagle * @portid: unique port identity in TIPC socket hash table @@ -106,11 +103,11 @@ struct sockaddr_pair { * @expect_ack: whether this TIPC socket is expecting an ack * @nodelay: setsockopt() TIPC_NODELAY setting * @group_is_open: TIPC socket group is fully open (FIXME) + * @published: true if port has one or more associated names + * @conn_addrtype: address type used when establishing connection */ struct tipc_sock { struct sock sk; - u32 conn_type; - u32 conn_instance; u32 max_pkt; u32 maxnagle; u32 portid; @@ -141,6 +138,7 @@ struct tipc_sock { bool nodelay; bool group_is_open; bool published; + u8 conn_addrtype; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -160,6 +158,7 @@ static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack); +static int tipc_wait_for_connect(struct socket *sock, long *timeo_p); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -664,7 +663,7 @@ static int tipc_release(struct socket *sock) * @skaddr: socket address describing name(s) and desired operation * @alen: size of socket address data structure * - * Name and name sequence binding is indicated using a positive scope value; + * Name and name sequence binding are indicated using a positive scope value; * a negative scope value unbinds the specified name. Specifying no name * (i.e. a socket address length of 0) unbinds all names from the socket. * @@ -1202,12 +1201,12 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct tipc_msg *hdr; struct tipc_uaddr ua; int user, mtyp, hlen; - bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); ua.addrtype = TIPC_SERVICE_RANGE; + /* tipc_skb_peek() increments the head skb's reference counter */ skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { hdr = buf_msg(skb); @@ -1216,6 +1215,12 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, hlen = skb_headroom(skb) + msg_hdr_sz(hdr); onode = msg_orignode(hdr); ua.sr.type = msg_nametype(hdr); + ua.sr.lower = msg_namelower(hdr); + ua.sr.upper = msg_nameupper(hdr); + if (onode == self) + ua.scope = TIPC_ANY_SCOPE; + else + ua.scope = TIPC_CLUSTER_SCOPE; if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); @@ -1233,20 +1238,10 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, ua.sr.lower = 0; ua.sr.upper = ~0; ua.scope = msg_lookup_scope(hdr); - exact = true; - } else { - /* TIPC_NODE_SCOPE means "any scope" in this context */ - if (onode == self) - ua.scope = TIPC_NODE_SCOPE; - else - ua.scope = TIPC_CLUSTER_SCOPE; - exact = false; - ua.sr.lower = msg_namelower(hdr); - ua.sr.upper = msg_nameupper(hdr); } /* Create destination port list: */ - tipc_nametbl_lookup_mcast_sockets(net, &ua, exact, &dports); + tipc_nametbl_lookup_mcast_sockets(net, &ua, &dports); /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { @@ -1258,13 +1253,11 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } pr_warn("Failed to clone mcast rcv buffer\n"); } - /* Append to inputq if not already done by other thread */ + /* Append clones to inputq only if skb is still head of arrvq */ spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { skb_queue_splice_tail_init(&tmpq, inputq); - /* Decrease the skb's refcnt as increasing in the - * function tipc_skb_peek - */ + /* Decrement the skb's refcnt */ kfree_skb(__skb_dequeue(arrvq)); } spin_unlock_bh(&inputq->lock); @@ -1433,7 +1426,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (ua) { if (!tipc_uaddr_valid(ua, m->msg_namelen)) return -EINVAL; - atype = ua->addrtype; + atype = ua->addrtype; } /* If socket belongs to a communication group follow other paths */ @@ -1463,10 +1456,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; - if (atype == TIPC_SERVICE_ADDR) { - tsk->conn_type = ua->sa.type; - tsk->conn_instance = ua->sa.instance; - } + if (atype == TIPC_SERVICE_ADDR) + tsk->conn_addrtype = atype; msg_set_syn(hdr, 1); } @@ -1525,8 +1516,13 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = 0; } - if (unlikely(syn && !rc)) + if (unlikely(syn && !rc)) { tipc_set_sk_state(sk, TIPC_CONNECTING); + if (dlen && timeout) { + timeout = msecs_to_jiffies(timeout); + tipc_wait_for_connect(sock, &timeout); + } + } return rc ? rc : dlen; } @@ -1574,7 +1570,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) return -EMSGSIZE; /* Handle implicit connection setup */ - if (unlikely(dest)) { + if (unlikely(dest && sk->sk_state == TIPC_OPEN)) { rc = __tipc_sendmsg(sock, m, dlen); if (dlen && dlen == rc) { tsk->peer_caps = tipc_node_get_capabilities(net, dnode); @@ -1737,67 +1733,58 @@ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { - struct tipc_msg *msg; - u32 anc_data[3]; - u32 err; - u32 dest_type; - int has_name; - int res; + struct tipc_msg *hdr; + u32 data[3] = {0,}; + bool has_addr; + int dlen, rc; if (likely(m->msg_controllen == 0)) return 0; - msg = buf_msg(skb); - /* Optionally capture errored message object(s) */ - err = msg ? msg_errcode(msg) : 0; - if (unlikely(err)) { - anc_data[0] = err; - anc_data[1] = msg_data_sz(msg); - res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data); - if (res) - return res; - if (anc_data[1]) { - if (skb_linearize(skb)) - return -ENOMEM; - msg = buf_msg(skb); - res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], - msg_data(msg)); - if (res) - return res; - } + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + + /* Capture errored message object, if any */ + if (msg_errcode(hdr)) { + if (skb_linearize(skb)) + return -ENOMEM; + hdr = buf_msg(skb); + data[0] = msg_errcode(hdr); + data[1] = dlen; + rc = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, data); + if (rc || !dlen) + return rc; + rc = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, dlen, msg_data(hdr)); + if (rc) + return rc; } - /* Optionally capture message destination object */ - dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG; - switch (dest_type) { + /* Capture TIPC_SERVICE_ADDR/RANGE destination address, if any */ + switch (msg_type(hdr)) { case TIPC_NAMED_MSG: - has_name = 1; - anc_data[0] = msg_nametype(msg); - anc_data[1] = msg_namelower(msg); - anc_data[2] = msg_namelower(msg); + has_addr = true; + data[0] = msg_nametype(hdr); + data[1] = msg_namelower(hdr); + data[2] = data[1]; break; case TIPC_MCAST_MSG: - has_name = 1; - anc_data[0] = msg_nametype(msg); - anc_data[1] = msg_namelower(msg); - anc_data[2] = msg_nameupper(msg); + has_addr = true; + data[0] = msg_nametype(hdr); + data[1] = msg_namelower(hdr); + data[2] = msg_nameupper(hdr); break; case TIPC_CONN_MSG: - has_name = (tsk->conn_type != 0); - anc_data[0] = tsk->conn_type; - anc_data[1] = tsk->conn_instance; - anc_data[2] = tsk->conn_instance; + has_addr = !!tsk->conn_addrtype; + data[0] = msg_nametype(&tsk->phdr); + data[1] = msg_nameinst(&tsk->phdr); + data[2] = data[1]; break; default: - has_name = 0; - } - if (has_name) { - res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data); - if (res) - return res; + has_addr = false; } - - return 0; + if (!has_addr) + return 0; + return put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, data); } static struct sk_buff *tipc_sk_build_ack(struct tipc_sock *tsk) @@ -1899,6 +1886,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, bool connected = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct tipc_skb_cb *skb_cb; struct sk_buff_head xmitq; struct tipc_msg *hdr; struct sk_buff *skb; @@ -1922,6 +1910,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; skb = skb_peek(&sk->sk_receive_queue); + skb_cb = TIPC_SKB_CB(skb); hdr = buf_msg(skb); dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); @@ -1941,18 +1930,33 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { - copy = min_t(int, dlen, buflen); - if (unlikely(copy != dlen)) - m->msg_flags |= MSG_TRUNC; - rc = skb_copy_datagram_msg(skb, hlen, m, copy); + int offset = skb_cb->bytes_read; + + copy = min_t(int, dlen - offset, buflen); + rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); + if (unlikely(rc)) + goto exit; + if (unlikely(offset + copy < dlen)) { + if (flags & MSG_EOR) { + if (!(flags & MSG_PEEK)) + skb_cb->bytes_read = offset + copy; + } else { + m->msg_flags |= MSG_TRUNC; + skb_cb->bytes_read = 0; + } + } else { + if (flags & MSG_EOR) + m->msg_flags |= MSG_EOR; + skb_cb->bytes_read = 0; + } } else { copy = 0; rc = 0; - if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) + if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) { rc = -ECONNRESET; + goto exit; + } } - if (unlikely(rc)) - goto exit; /* Mark message as group event if applicable */ if (unlikely(grp_evt)) { @@ -1975,6 +1979,9 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, tipc_node_distr_xmit(sock_net(sk), &xmitq); } + if (skb_cb->bytes_read) + goto exit; + tsk_advance_rx_queue(sk); if (likely(!connected)) @@ -2665,7 +2672,7 @@ static int tipc_listen(struct socket *sock, int len) static int tipc_wait_for_accept(struct socket *sock, long timeo) { struct sock *sk = sock->sk; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err; /* True wake-one mechanism for incoming connections: only @@ -2674,12 +2681,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * anymore, the common case will execute the loop only once. */ for (;;) { - prepare_to_wait_exclusive(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { + add_wait_queue(sk_sleep(sk), &wait); release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); + remove_wait_queue(sk_sleep(sk), &wait); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -2691,7 +2698,6 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) if (signal_pending(current)) break; } - finish_wait(sk_sleep(sk), &wait); return err; } @@ -2708,9 +2714,10 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *new_sk, *sk = sock->sk; - struct sk_buff *buf; struct tipc_sock *new_tsock; + struct msghdr m = {NULL,}; struct tipc_msg *msg; + struct sk_buff *buf; long timeo; int res; @@ -2750,24 +2757,23 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { - new_tsock->conn_type = msg_nametype(msg); - new_tsock->conn_instance = msg_nameinst(msg); + new_tsock->conn_addrtype = TIPC_SERVICE_ADDR; + msg_set_nametype(&new_tsock->phdr, msg_nametype(msg)); + msg_set_nameinst(&new_tsock->phdr, msg_nameinst(msg)); } /* - * Respond to 'SYN-' by discarding it & returning 'ACK'-. - * Respond to 'SYN+' by queuing it on new socket. + * Respond to 'SYN-' by discarding it & returning 'ACK'. + * Respond to 'SYN+' by queuing it on new socket & returning 'ACK'. */ if (!msg_data_sz(msg)) { - struct msghdr m = {NULL,}; - tsk_advance_rx_queue(sk); - __tipc_sendstream(new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } + __tipc_sendstream(new_sock, &m, 0); release_sock(new_sk); exit: release_sock(sk); @@ -3455,13 +3461,14 @@ void tipc_socket_stop(void) /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) { - u32 peer_node; - u32 peer_port; + u32 peer_node, peer_port; + u32 conn_type, conn_instance; struct nlattr *nest; peer_node = tsk_peer_node(tsk); peer_port = tsk_peer_port(tsk); - + conn_type = msg_nametype(&tsk->phdr); + conn_instance = msg_nameinst(&tsk->phdr); nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_CON); if (!nest) return -EMSGSIZE; @@ -3471,12 +3478,12 @@ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) if (nla_put_u32(skb, TIPC_NLA_CON_SOCK, peer_port)) goto msg_full; - if (tsk->conn_type != 0) { + if (tsk->conn_addrtype != 0) { if (nla_put_flag(skb, TIPC_NLA_CON_FLAG)) goto msg_full; - if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, tsk->conn_type)) + if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, conn_type)) goto msg_full; - if (nla_put_u32(skb, TIPC_NLA_CON_INST, tsk->conn_instance)) + if (nla_put_u32(skb, TIPC_NLA_CON_INST, conn_instance)) goto msg_full; } nla_nest_end(skb, nest); @@ -3866,9 +3873,9 @@ bool tipc_sk_filtering(struct sock *sk) } if (!tipc_sk_type_connectionless(sk)) { - type = tsk->conn_type; - lower = tsk->conn_instance; - upper = tsk->conn_instance; + type = msg_nametype(&tsk->phdr); + lower = msg_nameinst(&tsk->phdr); + upper = lower; } if ((_type && _type != type) || (_lower && _lower != lower) || @@ -3933,6 +3940,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) { int i = 0; size_t sz = (dqueues) ? SK_LMAX : SK_LMIN; + u32 conn_type, conn_instance; struct tipc_sock *tsk; struct publication *p; bool tsk_connected; @@ -3953,8 +3961,10 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) if (tsk_connected) { i += scnprintf(buf + i, sz - i, " %x", tsk_peer_node(tsk)); i += scnprintf(buf + i, sz - i, " %u", tsk_peer_port(tsk)); - i += scnprintf(buf + i, sz - i, " %u", tsk->conn_type); - i += scnprintf(buf + i, sz - i, " %u", tsk->conn_instance); + conn_type = msg_nametype(&tsk->phdr); + conn_instance = msg_nameinst(&tsk->phdr); + i += scnprintf(buf + i, sz - i, " %u", conn_type); + i += scnprintf(buf + i, sz - i, " %u", conn_instance); } i += scnprintf(buf + i, sz - i, " | %u", tsk->published); if (tsk->published) { diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 8e00d739f03a..05d49ad81290 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -66,7 +66,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, /** * tipc_sub_check_overlap - test for subscription overlap with the given values * @subscribed: the service range subscribed for - * @found: the service range we are checning for match + * @found: the service range we are checking for match * * Returns true if there is overlap, otherwise false. */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bd9f1567aa39..b932469ee69c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record) int i; for (i = 0; i < record->num_frags; i++) - __skb_frag_unref(&record->frags[i]); + __skb_frag_unref(&record->frags[i], false); kfree(record); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 694de024d0ee..4feb95e34b64 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1153,7 +1153,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, int ret = 0; bool eor; - eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); + eor = !(flags & MSG_SENDPAGE_NOTLAST); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Call the sk_stream functions to manage the sndbuf mem. */ @@ -2019,8 +2019,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, if (copied < 0) goto splice_read_end; - if (likely(!(flags & MSG_PEEK))) - tls_sw_advance_skb(sk, skb, copied); + tls_sw_advance_skb(sk, skb, copied); splice_read_end: release_sock(sk); diff --git a/net/unix/Kconfig b/net/unix/Kconfig index b6c4282899ec..b7f811216820 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -25,6 +25,11 @@ config UNIX_SCM depends on UNIX default y +config AF_UNIX_OOB + bool + depends on UNIX + default y + config UNIX_DIAG tristate "UNIX: socket monitoring interface" depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 54e58cc4f945..20491825b4d0 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_UNIX) += unix.o unix-y := af_unix.o garbage.o unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o +unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5d1192ceb139..eb47b9de2380 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -113,6 +113,7 @@ #include <linux/security.h> #include <linux/freezer.h> #include <linux/file.h> +#include <linux/btf_ids.h> #include "scm.h" @@ -262,6 +263,14 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) sk_add_node(sk, list); } +static void __unix_set_addr(struct sock *sk, struct unix_address *addr, + unsigned hash) +{ + __unix_remove_socket(sk); + smp_store_release(&unix_sk(sk)->addr, addr); + __unix_insert_socket(&unix_socket_table[hash], sk); +} + static inline void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); @@ -278,11 +287,11 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, unsigned int hash) + int len, unsigned int hash) { struct sock *s; - sk_for_each(s, &unix_socket_table[hash ^ type]) { + sk_for_each(s, &unix_socket_table[hash]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) @@ -297,13 +306,12 @@ static struct sock *__unix_find_socket_byname(struct net *net, static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, - unsigned int hash) + int len, unsigned int hash) { struct sock *s; spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(net, sunname, len, type, hash); + s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); @@ -484,9 +492,10 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { other->sk_err = ECONNRESET; - other->sk_error_report(other); + sk_error_report(other); } } + other->sk_state = TCP_CLOSE; } static void unix_sock_destructor(struct sock *sk) @@ -495,6 +504,12 @@ static void unix_sock_destructor(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); @@ -662,6 +677,10 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); +static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -715,6 +734,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, + .read_sock = unix_stream_read_sock, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -739,6 +759,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, + .read_sock = unix_read_sock, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -770,13 +791,42 @@ static const struct proto_ops unix_seqpacket_ops = { .show_fdinfo = unix_show_fdinfo, }; -static struct proto unix_proto = { - .name = "UNIX", +static void unix_close(struct sock *sk, long timeout) +{ + /* Nothing to do here, unix socket does not need a ->close(). + * This is merely for sockmap. + */ +} + +static void unix_unhash(struct sock *sk) +{ + /* Nothing to do here, unix socket does not need a ->unhash(). + * This is merely for sockmap. + */ +} + +struct proto unix_dgram_proto = { + .name = "UNIX-DGRAM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), + .close = unix_close, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_dgram_bpf_update_proto, +#endif +}; + +struct proto unix_stream_proto = { + .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), + .close = unix_close, + .unhash = unix_unhash, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_stream_bpf_update_proto, +#endif }; -static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct sock *sk = NULL; struct unix_sock *u; @@ -785,7 +835,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); + if (type == SOCK_STREAM) + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); + else /*dgram and seqpacket */ + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); + if (!sk) goto out; @@ -847,7 +901,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -857,6 +911,7 @@ static int unix_release(struct socket *sock) if (!sk) return 0; + sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; @@ -891,12 +946,12 @@ static int unix_autobind(struct socket *sock) retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, - addr->hash)) { + if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -911,11 +966,8 @@ retry: } goto retry; } - addr->hash ^= sk->sk_type; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(&unix_socket_table[addr->hash], sk); + __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); err = 0; @@ -960,7 +1012,7 @@ static struct sock *unix_find_other(struct net *net, } } else { err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type, hash); + u = unix_find_socket_byname(net, sunname, len, type ^ hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->path.dentry; @@ -978,125 +1030,125 @@ fail: return NULL; } -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { + struct unix_sock *u = unix_sk(sk); + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + struct user_namespace *ns; // barf... + struct path parent; struct dentry *dentry; - struct path path; - int err = 0; + unsigned int hash; + int err; + /* * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); + dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) - return err; + return PTR_ERR(dentry); + ns = mnt_user_ns(parent.mnt); /* * All right, let's create it. */ - err = security_path_mknod(&path, dentry, mode, 0); - if (!err) { - err = vfs_mknod(mnt_user_ns(path.mnt), d_inode(path.dentry), - dentry, mode, 0); - if (!err) { - res->mnt = mntget(path.mnt); - res->dentry = dget(dentry); - } - } - done_path_create(&path, dentry); + err = security_path_mknod(&parent, dentry, mode, 0); + if (!err) + err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); + if (err) + goto out; + err = mutex_lock_interruptible(&u->bindlock); + if (err) + goto out_unlink; + if (u->addr) + goto out_unlock; + + addr->hash = UNIX_HASH_SIZE; + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); + spin_lock(&unix_table_lock); + u->path.mnt = mntget(parent.mnt); + u->path.dentry = dget(dentry); + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + done_path_create(&parent, dentry); + return 0; + +out_unlock: + mutex_unlock(&u->bindlock); + err = -EINVAL; +out_unlink: + /* failed after successful mknod? unlink what we'd created... */ + vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL); +out: + done_path_create(&parent, dentry); return err; } +static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + int err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) + return err; + + if (u->addr) { + mutex_unlock(&u->bindlock); + return -EINVAL; + } + + spin_lock(&unix_table_lock); + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + addr->hash)) { + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return -EADDRINUSE; + } + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; int err; unsigned int hash; struct unix_address *addr; - struct hlist_head *list; - struct path path = { }; - err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || sunaddr->sun_family != AF_UNIX) - goto out; + return -EINVAL; - if (addr_len == sizeof(short)) { - err = unix_autobind(sock); - goto out; - } + if (addr_len == sizeof(short)) + return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) - goto out; + return err; addr_len = err; - - if (sun_path[0]) { - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - goto out; - } - } - - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_put; - - err = -EINVAL; - if (u->addr) - goto out_up; - - err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) - goto out_up; + return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { - addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); - spin_lock(&unix_table_lock); - u->path = path; - list = &unix_socket_table[hash]; - } else { - spin_lock(&unix_table_lock); - err = -EADDRINUSE; - if (__unix_find_socket_byname(net, sunaddr, addr_len, - sk->sk_type, hash)) { - unix_release_addr(addr); - goto out_unlock; - } - - list = &unix_socket_table[addr->hash]; - } - - err = 0; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(list, sk); - -out_unlock: - spin_unlock(&unix_table_lock); -out_up: - mutex_unlock(&u->bindlock); -out_put: + if (sun_path[0]) + err = unix_bind_bsd(sk, addr); + else + err = unix_bind_abstract(sk, addr); if (err) - path_put(&path); -out: - return err; + unix_release_addr(addr); + return err == -EEXIST ? -EADDRINUSE : err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) @@ -1170,6 +1222,7 @@ restart: if (err) goto out_unlock; + sk->sk_state = other->sk_state = TCP_ESTABLISHED; } else { /* * 1003.1g breaking connected state with AF_UNSPEC @@ -1183,7 +1236,10 @@ restart: */ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); + unix_peer(sk) = other; + if (!other) + sk->sk_state = TCP_CLOSE; unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); @@ -1195,6 +1251,7 @@ restart: unix_peer(sk) = other; unix_state_double_unlock(sk, other); } + return 0; out_unlock: @@ -1260,7 +1317,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0); + newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); if (newsk == NULL) goto out; @@ -1393,7 +1450,7 @@ restart: unix_state_unlock(sk); - /* take ten and and send info to listening sock */ + /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -1427,12 +1484,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) init_peercred(ska); init_peercred(skb); - if (ska->sk_type != SOCK_DGRAM) { - ska->sk_state = TCP_ESTABLISHED; - skb->sk_state = TCP_ESTABLISHED; - socka->state = SS_CONNECTED; - sockb->state = SS_CONNECTED; - } + ska->sk_state = TCP_ESTABLISHED; + skb->sk_state = TCP_ESTABLISHED; + socka->state = SS_CONNECTED; + sockb->state = SS_CONNECTED; return 0; } @@ -1522,6 +1577,53 @@ out: return err; } +static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* + * Garbage collection of unix sockets starts by selecting a set of + * candidate sockets which have reference only from being in flight + * (total_refs == inflight_refs). This condition is checked once during + * the candidate collection phase, and candidates are marked as such, so + * that non-candidates can later be ignored. While inflight_refs is + * protected by unix_gc_lock, total_refs (file count) is not, hence this + * is an instantaneous decision. + * + * Once a candidate, however, the socket must not be reinstalled into a + * file descriptor while the garbage collection is in progress. + * + * If the above conditions are met, then the directed graph of + * candidates (*) does not change while unix_gc_lock is held. + * + * Any operations that changes the file count through file descriptors + * (dup, close, sendmsg) does not change the graph since candidates are + * not installed in fds. + * + * Dequeing a candidate via recvmsg would install it into an fd, but + * that takes unix_gc_lock to decrement the inflight count, so it's + * serialized with garbage collection. + * + * MSG_PEEK is special in that it does not change the inflight count, + * yet does install the socket into an fd. The following lock/unlock + * pair is to ensure serialization with garbage collection. It must be + * done between incrementing the file count and installing the file into + * an fd. + * + * If garbage collection starts after the barrier provided by the + * lock/unlock, then it will see the elevated refcount and not mark this + * as a candidate. If a garbage collection is already in progress + * before the file count was incremented, then the lock/unlock pair will + * ensure that garbage collection is finished before progressing to + * installing the fd. + * + * (*) A -> B where B is on the queue of A or B is on the queue of C + * which is on the queue of listening socket A. + */ + spin_lock(&unix_gc_lock); + spin_unlock(&unix_gc_lock); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; @@ -1731,6 +1833,7 @@ restart_locked: unix_state_unlock(sk); + sk->sk_state = TCP_CLOSE; unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; @@ -1821,6 +1924,53 @@ out: */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other) +{ + struct unix_sock *ousk = unix_sk(other); + struct sk_buff *skb; + int err = 0; + + skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); + + if (!skb) + return err; + + skb_put(skb, 1); + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); + + if (err) { + kfree_skb(skb); + return err; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + (other->sk_shutdown & RCV_SHUTDOWN)) { + unix_state_unlock(other); + kfree_skb(skb); + return -EPIPE; + } + + maybe_add_creds(skb, sock, other); + skb_get(skb); + + if (ousk->oob_skb) + consume_skb(ousk->oob_skb); + + ousk->oob_skb = skb; + + scm_stat_add(other, skb); + skb_queue_tail(&other->sk_receive_queue, skb); + sk_send_sigurg(other); + unix_state_unlock(other); + other->sk_data_ready(other); + + return err; +} +#endif + static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { @@ -1839,8 +1989,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return err; err = -EOPNOTSUPP; - if (msg->msg_flags&MSG_OOB) - goto out_err; + if (msg->msg_flags & MSG_OOB) { +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) + if (len) + len--; + else +#endif + goto out_err; + } if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; @@ -1905,6 +2061,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, sent += size; } +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) + if (msg->msg_flags & MSG_OOB) { + err = queue_oob(sock, msg, other); + if (err) + goto out_err; + sent++; + } +#endif + scm_destroy(&scm); return sent; @@ -2077,11 +2242,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk) } } -static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, + int flags) { struct scm_cookie scm; - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; @@ -2171,7 +2336,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; @@ -2184,6 +2349,55 @@ out: return err; } +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + +#ifdef CONFIG_BPF_SYSCALL + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_dgram_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif + return __unix_dgram_recvmsg(sk, msg, size, flags); +} + +static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int used, err; + + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, 0, 1, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; + + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + kfree_skb(skb); + break; + } else if (used <= skb->len) { + copied += used; + } + + kfree_skb(skb); + if (!desc->count) + break; + } + + return copied; +} + /* * Sleep until more data has arrived. But check for races.. */ @@ -2243,6 +2457,86 @@ struct unix_stream_read_state { unsigned int splice_flags; }; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +static int unix_stream_recv_urg(struct unix_stream_read_state *state) +{ + struct socket *sock = state->socket; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + int chunk = 1; + struct sk_buff *oob_skb; + + mutex_lock(&u->iolock); + unix_state_lock(sk); + + if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { + unix_state_unlock(sk); + mutex_unlock(&u->iolock); + return -EINVAL; + } + + oob_skb = u->oob_skb; + + if (!(state->flags & MSG_PEEK)) { + u->oob_skb = NULL; + } + + unix_state_unlock(sk); + + chunk = state->recv_actor(oob_skb, 0, chunk, state); + + if (!(state->flags & MSG_PEEK)) { + UNIXCB(oob_skb).consumed += 1; + kfree_skb(oob_skb); + } + + mutex_unlock(&u->iolock); + + if (chunk < 0) + return -EFAULT; + + state->msg->msg_flags |= MSG_OOB; + return 1; +} + +static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, + int flags, int copied) +{ + struct unix_sock *u = unix_sk(sk); + + if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + skb = NULL; + } else { + if (skb == u->oob_skb) { + if (copied) { + skb = NULL; + } else if (sock_flag(sk, SOCK_URGINLINE)) { + if (!(flags & MSG_PEEK)) { + u->oob_skb = NULL; + consume_skb(skb); + } + } else if (!(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + skb = skb_peek(&sk->sk_receive_queue); + } + } + } + return skb; +} +#endif + +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + return -ENOTCONN; + + return unix_read_sock(sk, desc, recv_actor); +} + static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { @@ -2268,6 +2562,9 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + err = unix_stream_recv_urg(state); +#endif goto out; } @@ -2296,6 +2593,18 @@ redo: } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb) { + skb = manage_oob(skb, sk, flags, copied); + if (!skb) { + unix_state_unlock(sk); + if (copied) + break; + goto redo; + } + } +#endif again: if (skb == NULL) { if (copied >= target) @@ -2414,7 +2723,7 @@ unlock: /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); @@ -2453,6 +2762,20 @@ static int unix_stream_read_actor(struct sk_buff *skb, return ret ?: chunk; } +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sk->sk_socket, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state, true); +} + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -2464,6 +2787,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, .flags = flags }; +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_stream_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif return unix_stream_read_generic(&state, true); } @@ -2524,7 +2855,10 @@ static int unix_shutdown(struct socket *sock, int mode) (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; + const struct proto *prot = READ_ONCE(other->sk_prot); + if (prot->unhash) + prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) @@ -2533,10 +2867,12 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) + if (peer_mode == SHUTDOWN_MASK) { sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - else if (peer_mode & RCV_SHUTDOWN) + other->sk_state = TCP_CLOSE; + } else if (peer_mode & RCV_SHUTDOWN) { sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); + } } if (other) sock_put(other); @@ -2631,6 +2967,20 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCUNIXFILE: err = unix_open_file(sk); break; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + case SIOCATMARK: + { + struct sk_buff *skb; + struct unix_sock *u = unix_sk(sk); + int answ = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb && skb == u->oob_skb) + answ = 1; + err = put_user(answ, (int __user *)arg); + } + break; +#endif default: err = -ENOIOCTLCMD; break; @@ -2867,6 +3217,64 @@ static const struct seq_operations unix_seq_ops = { .stop = unix_seq_stop, .show = unix_seq_show, }; + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__unix { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct unix_sock *, unix_sk); + uid_t uid __aligned(8); +}; + +static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) +{ + struct bpf_iter__unix ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.unix_sk = unix_sk; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return unix_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)unix_prog_seq_show(prog, &meta, v, 0); + } + + unix_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = bpf_iter_unix_seq_stop, + .show = bpf_iter_unix_seq_show, +}; +#endif #endif static const struct net_proto_family unix_family_ops = { @@ -2907,13 +3315,48 @@ static struct pernet_operations unix_net_ops = { .exit = unix_net_exit, }; +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) + +static const struct bpf_iter_seq_info unix_seq_info = { + .seq_ops = &bpf_iter_unix_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct seq_net_private), +}; + +static struct bpf_iter_reg unix_reg_info = { + .target = "unix", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__unix, unix_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &unix_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; + if (bpf_iter_reg_target(&unix_reg_info)) + pr_warn("Warning: could not register bpf iterator unix\n"); +} +#endif + static int __init af_unix_init(void) { int rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - rc = proto_register(&unix_proto, 1); + rc = proto_register(&unix_dgram_proto, 1); + if (rc != 0) { + pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + goto out; + } + + rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; @@ -2921,6 +3364,12 @@ static int __init af_unix_init(void) sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); + unix_bpf_build_proto(); + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif + out: return rc; } @@ -2928,7 +3377,8 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - proto_unregister(&unix_proto); + proto_unregister(&unix_dgram_proto); + proto_unregister(&unix_stream_proto); unregister_pernet_subsys(&unix_net_ops); } diff --git a/net/unix/diag.c b/net/unix/diag.c index 9ff64f9df1f3..7e7d7f45685a 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -295,10 +295,8 @@ again: goto again; } - err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + out: if (sk) sock_put(sk); diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c new file mode 100644 index 000000000000..b927e2baae50 --- /dev/null +++ b/net/unix/unix_bpf.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */ + +#include <linux/skmsg.h> +#include <linux/bpf.h> +#include <net/sock.h> +#include <net/af_unix.h> + +#define unix_sk_has_data(__sk, __psock) \ + ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ + !skb_queue_empty(&__psock->ingress_skb) || \ + !list_empty(&__psock->ingress_msg); \ + }) + +static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct unix_sock *u = unix_sk(sk); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + if (!unix_sk_has_data(sk, psock)) { + mutex_unlock(&u->iolock); + wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + mutex_lock(&u->iolock); + ret = unix_sk_has_data(sk, psock); + } + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + if (sk->sk_type == SOCK_DGRAM) + return __unix_dgram_recvmsg(sk, msg, len, flags); + else + return __unix_stream_recvmsg(sk, msg, len, flags); +} + +static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, + int *addr_len) +{ + struct unix_sock *u = unix_sk(sk); + struct sk_psock *psock; + int copied; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return __unix_recvmsg(sk, msg, len, flags); + + mutex_lock(&u->iolock); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_recvmsg(sk, msg, len, flags); + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, nonblock); + data = unix_msg_wait_data(sk, psock, timeo); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_recvmsg(sk, msg, len, flags); + } + copied = -EAGAIN; + } + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return copied; +} + +static struct proto *unix_dgram_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_dgram_prot_lock); +static struct proto unix_dgram_bpf_prot; + +static struct proto *unix_stream_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_stream_prot_lock); +static struct proto unix_stream_bpf_prot; + +static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) +{ + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = unix_bpf_recvmsg; +} + +static void unix_stream_bpf_rebuild_protos(struct proto *prot, + const struct proto *base) +{ + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = unix_bpf_recvmsg; + prot->unhash = sock_map_unhash; +} + +static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) +{ + if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { + spin_lock_bh(&unix_dgram_prot_lock); + if (likely(ops != unix_dgram_prot_saved)) { + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); + smp_store_release(&unix_dgram_prot_saved, ops); + } + spin_unlock_bh(&unix_dgram_prot_lock); + } +} + +static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) +{ + if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { + spin_lock_bh(&unix_stream_prot_lock); + if (likely(ops != unix_stream_prot_saved)) { + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); + smp_store_release(&unix_stream_prot_saved, ops); + } + spin_unlock_bh(&unix_stream_prot_lock); + } +} + +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +{ + if (sk->sk_type != SOCK_DGRAM) + return -EOPNOTSUPP; + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + + unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); + return 0; +} + +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +{ + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + + unix_stream_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); + return 0; +} + +void __init unix_bpf_build_proto(void) +{ + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); + +} diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 92a72f0e0d94..e2c0cfb334d2 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -415,8 +415,8 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) /* Assign a transport to a socket and call the .init transport callback. * - * Note: for stream socket this must be called when vsk->remote_addr is set - * (e.g. during the connect() or when a connection request on a listener + * Note: for connection oriented socket this must be called when vsk->remote_addr + * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if @@ -452,6 +452,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) new_transport = transport_dgram; break; case SOCK_STREAM: + case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || @@ -469,10 +470,10 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) return 0; /* transport->release() must be called with sock lock acquired. - * This path can only be taken during vsock_stream_connect(), - * where we have already held the sock lock. - * In the other cases, this function is called on a new socket - * which is not assigned to any transport. + * This path can only be taken during vsock_connect(), where we + * have already held the sock lock. In the other cases, this + * function is called on a new socket which is not assigned to + * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); @@ -484,6 +485,14 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (!new_transport || !try_module_get(new_transport->module)) return -ENODEV; + if (sk->sk_type == SOCK_SEQPACKET) { + if (!new_transport->seqpacket_allow || + !new_transport->seqpacket_allow(remote_cid)) { + module_put(new_transport->module); + return -ESOCKTNOSUPPORT; + } + } + ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); @@ -604,8 +613,8 @@ out: /**** SOCKET OPERATIONS ****/ -static int __vsock_bind_stream(struct vsock_sock *vsk, - struct sockaddr_vm *addr) +static int __vsock_bind_connectible(struct vsock_sock *vsk, + struct sockaddr_vm *addr) { static u32 port; struct sockaddr_vm new_addr; @@ -649,9 +658,10 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); - /* Remove stream sockets from the unbound list and add them to the hash - * table for easy lookup by its address. The unbound list is simply an - * extra entry at the end of the hash table, a trick used by AF_UNIX. + /* Remove connection oriented sockets from the unbound list and add them + * to the hash table for easy lookup by its address. The unbound list + * is simply an extra entry at the end of the hash table, a trick used + * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); @@ -684,8 +694,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) switch (sk->sk_socket->type) { case SOCK_STREAM: + case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); - retval = __vsock_bind_stream(vsk, addr); + retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; @@ -768,6 +779,11 @@ static struct sock *__vsock_create(struct net *net, return sk; } +static bool sock_type_connectible(u16 type) +{ + return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); +} + static void __vsock_release(struct sock *sk, int level) { if (sk) { @@ -786,7 +802,7 @@ static void __vsock_release(struct sock *sk, int level) if (vsk->transport) vsk->transport->release(vsk); - else if (sk->sk_type == SOCK_STREAM) + else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); @@ -844,6 +860,16 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_stream_has_data); +static s64 vsock_connectible_has_data(struct vsock_sock *vsk) +{ + struct sock *sk = sk_vsock(vsk); + + if (sk->sk_type == SOCK_SEQPACKET) + return vsk->transport->seqpacket_has_data(vsk); + else + return vsock_stream_has_data(vsk); +} + s64 vsock_stream_has_space(struct vsock_sock *vsk) { return vsk->transport->stream_has_space(vsk); @@ -937,10 +963,10 @@ static int vsock_shutdown(struct socket *sock, int mode) if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; - /* If this is a STREAM socket and it is not connected then bail out - * immediately. If it is a DGRAM socket then we must first kick the - * socket so that it wakes up from any sleeping calls, for example - * recv(), and then afterwards return the error. + /* If this is a connection oriented socket and it is not connected then + * bail out immediately. If it is a DGRAM socket then we must first + * kick the socket so that it wakes up from any sleeping calls, for + * example recv(), and then afterwards return the error. */ sk = sock->sk; @@ -948,7 +974,7 @@ static int vsock_shutdown(struct socket *sock, int mode) lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; - if (sk->sk_type == SOCK_STREAM) + if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; @@ -961,7 +987,7 @@ static int vsock_shutdown(struct socket *sock, int mode) sk->sk_shutdown |= mode; sk->sk_state_change(sk); - if (sk->sk_type == SOCK_STREAM) { + if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } @@ -1016,7 +1042,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; - } else if (sock->type == SOCK_STREAM) { + } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); @@ -1255,7 +1281,7 @@ static void vsock_connect_timeout(struct work_struct *work) (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); vsock_transport_cancel_pkt(vsk); } release_sock(sk); @@ -1263,8 +1289,8 @@ static void vsock_connect_timeout(struct work_struct *work) sock_put(sk); } -static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, - int addr_len, int flags) +static int vsock_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) { int err; struct sock *sk; @@ -1369,7 +1395,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = TCP_CLOSE; + sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; @@ -1414,7 +1440,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(listener); - if (sock->type != SOCK_STREAM) { + if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } @@ -1491,7 +1517,7 @@ static int vsock_listen(struct socket *sock, int backlog) lock_sock(sk); - if (sock->type != SOCK_STREAM) { + if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } @@ -1535,11 +1561,11 @@ static void vsock_update_buffer_size(struct vsock_sock *vsk, vsk->buffer_size = val; } -static int vsock_stream_setsockopt(struct socket *sock, - int level, - int optname, - sockptr_t optval, - unsigned int optlen) +static int vsock_connectible_setsockopt(struct socket *sock, + int level, + int optname, + sockptr_t optval, + unsigned int optlen) { int err; struct sock *sk; @@ -1617,10 +1643,10 @@ exit: return err; } -static int vsock_stream_getsockopt(struct socket *sock, - int level, int optname, - char __user *optval, - int __user *optlen) +static int vsock_connectible_getsockopt(struct socket *sock, + int level, int optname, + char __user *optval, + int __user *optlen) { int err; int len; @@ -1688,8 +1714,8 @@ static int vsock_stream_getsockopt(struct socket *sock, return 0; } -static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, - size_t len) +static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk; struct vsock_sock *vsk; @@ -1712,7 +1738,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, transport = vsk->transport; - /* Callers should not provide a destination with stream sockets. */ + /* Callers should not provide a destination with connection oriented + * sockets. + */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; @@ -1803,9 +1831,13 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, * responsibility to check how many bytes we were able to send. */ - written = transport->stream_enqueue( - vsk, msg, - len - total_written); + if (sk->sk_type == SOCK_SEQPACKET) { + written = transport->seqpacket_enqueue(vsk, + msg, len - total_written); + } else { + written = transport->stream_enqueue(vsk, + msg, len - total_written); + } if (written < 0) { err = -ENOMEM; goto out_err; @@ -1821,72 +1853,98 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, } out_err: - if (total_written > 0) - err = total_written; + if (total_written > 0) { + /* Return number of written bytes only if: + * 1) SOCK_STREAM socket. + * 2) SOCK_SEQPACKET socket when whole buffer is sent. + */ + if (sk->sk_type == SOCK_STREAM || total_written == len) + err = total_written; + } out: release_sock(sk); return err; } - -static int -vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +static int vsock_connectible_wait_data(struct sock *sk, + struct wait_queue_entry *wait, + long timeout, + struct vsock_transport_recv_notify_data *recv_data, + size_t target) { - struct sock *sk; - struct vsock_sock *vsk; const struct vsock_transport *transport; + struct vsock_sock *vsk; + s64 data; int err; - size_t target; - ssize_t copied; - long timeout; - struct vsock_transport_recv_notify_data recv_data; - DEFINE_WAIT(wait); - - sk = sock->sk; vsk = vsock_sk(sk); err = 0; + transport = vsk->transport; - lock_sock(sk); + while ((data = vsock_connectible_has_data(vsk)) == 0) { + prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); - transport = vsk->transport; + if (sk->sk_err != 0 || + (sk->sk_shutdown & RCV_SHUTDOWN) || + (vsk->peer_shutdown & SEND_SHUTDOWN)) { + break; + } - if (!transport || sk->sk_state != TCP_ESTABLISHED) { - /* Recvmsg is supposed to return 0 if a peer performs an - * orderly shutdown. Differentiate between that case and when a - * peer has not connected or a local shutdown occurred with the - * SOCK_DONE flag. - */ - if (sock_flag(sk, SOCK_DONE)) - err = 0; - else - err = -ENOTCONN; + /* Don't wait for non-blocking sockets. */ + if (timeout == 0) { + err = -EAGAIN; + break; + } - goto out; - } + if (recv_data) { + err = transport->notify_recv_pre_block(vsk, target, recv_data); + if (err < 0) + break; + } - if (flags & MSG_OOB) { - err = -EOPNOTSUPP; - goto out; - } + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); - /* We don't check peer_shutdown flag here since peer may actually shut - * down, but there can be data in the queue that a local socket can - * receive. - */ - if (sk->sk_shutdown & RCV_SHUTDOWN) { - err = 0; - goto out; + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + break; + } else if (timeout == 0) { + err = -EAGAIN; + break; + } } - /* It is valid on Linux to pass in a zero-length receive buffer. This - * is not an error. We may as well bail out now. + finish_wait(sk_sleep(sk), wait); + + if (err) + return err; + + /* Internal transport error when checking for available + * data. XXX This should be changed to a connection + * reset in a later change. */ - if (!len) { - err = 0; - goto out; - } + if (data < 0) + return -ENOMEM; + + return data; +} + +static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + struct vsock_transport_recv_notify_data recv_data; + const struct vsock_transport *transport; + struct vsock_sock *vsk; + ssize_t copied; + size_t target; + long timeout; + int err; + + DEFINE_WAIT(wait); + + vsk = vsock_sk(sk); + transport = vsk->transport; /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to @@ -1908,94 +1966,158 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, while (1) { - s64 ready; + ssize_t read; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - ready = vsock_stream_has_data(vsk); + err = vsock_connectible_wait_data(sk, &wait, timeout, + &recv_data, target); + if (err <= 0) + break; - if (ready == 0) { - if (sk->sk_err != 0 || - (sk->sk_shutdown & RCV_SHUTDOWN) || - (vsk->peer_shutdown & SEND_SHUTDOWN)) { - finish_wait(sk_sleep(sk), &wait); - break; - } - /* Don't wait for non-blocking sockets. */ - if (timeout == 0) { - err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); - break; - } + err = transport->notify_recv_pre_dequeue(vsk, target, + &recv_data); + if (err < 0) + break; - err = transport->notify_recv_pre_block( - vsk, target, &recv_data); - if (err < 0) { - finish_wait(sk_sleep(sk), &wait); - break; - } - release_sock(sk); - timeout = schedule_timeout(timeout); - lock_sock(sk); + read = transport->stream_dequeue(vsk, msg, len - copied, flags); + if (read < 0) { + err = -ENOMEM; + break; + } - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); - break; - } else if (timeout == 0) { - err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); - break; - } - } else { - ssize_t read; + copied += read; - finish_wait(sk_sleep(sk), &wait); + err = transport->notify_recv_post_dequeue(vsk, target, read, + !(flags & MSG_PEEK), &recv_data); + if (err < 0) + goto out; - if (ready < 0) { - /* Invalid queue pair content. XXX This should - * be changed to a connection reset in a later - * change. - */ + if (read >= target || flags & MSG_PEEK) + break; - err = -ENOMEM; - goto out; - } + target -= read; + } - err = transport->notify_recv_pre_dequeue( - vsk, target, &recv_data); - if (err < 0) - break; + if (sk->sk_err) + err = -sk->sk_err; + else if (sk->sk_shutdown & RCV_SHUTDOWN) + err = 0; - read = transport->stream_dequeue( - vsk, msg, - len - copied, flags); - if (read < 0) { - err = -ENOMEM; - break; - } + if (copied > 0) + err = copied; + +out: + return err; +} - copied += read; +static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + const struct vsock_transport *transport; + struct vsock_sock *vsk; + ssize_t msg_len; + long timeout; + int err = 0; + DEFINE_WAIT(wait); - err = transport->notify_recv_post_dequeue( - vsk, target, read, - !(flags & MSG_PEEK), &recv_data); - if (err < 0) - goto out; + vsk = vsock_sk(sk); + transport = vsk->transport; - if (read >= target || flags & MSG_PEEK) - break; + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - target -= read; - } + err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); + if (err <= 0) + goto out; + + msg_len = transport->seqpacket_dequeue(vsk, msg, flags); + + if (msg_len < 0) { + err = -ENOMEM; + goto out; } - if (sk->sk_err) + if (sk->sk_err) { err = -sk->sk_err; - else if (sk->sk_shutdown & RCV_SHUTDOWN) + } else if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; + } else { + /* User sets MSG_TRUNC, so return real length of + * packet. + */ + if (flags & MSG_TRUNC) + err = msg_len; + else + err = len - msg_data_left(msg); - if (copied > 0) - err = copied; + /* Always set MSG_TRUNC if real length of packet is + * bigger than user's buffer. + */ + if (msg_len > len) + msg->msg_flags |= MSG_TRUNC; + } + +out: + return err; +} + +static int +vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk; + struct vsock_sock *vsk; + const struct vsock_transport *transport; + int err; + + DEFINE_WAIT(wait); + + sk = sock->sk; + vsk = vsock_sk(sk); + err = 0; + + lock_sock(sk); + + transport = vsk->transport; + + if (!transport || sk->sk_state != TCP_ESTABLISHED) { + /* Recvmsg is supposed to return 0 if a peer performs an + * orderly shutdown. Differentiate between that case and when a + * peer has not connected or a local shutdown occurred with the + * SOCK_DONE flag. + */ + if (sock_flag(sk, SOCK_DONE)) + err = 0; + else + err = -ENOTCONN; + + goto out; + } + + if (flags & MSG_OOB) { + err = -EOPNOTSUPP; + goto out; + } + + /* We don't check peer_shutdown flag here since peer may actually shut + * down, but there can be data in the queue that a local socket can + * receive. + */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + err = 0; + goto out; + } + + /* It is valid on Linux to pass in a zero-length receive buffer. This + * is not an error. We may as well bail out now. + */ + if (!len) { + err = 0; + goto out; + } + + if (sk->sk_type == SOCK_STREAM) + err = __vsock_stream_recvmsg(sk, msg, len, flags); + else + err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); @@ -2007,7 +2129,7 @@ static const struct proto_ops vsock_stream_ops = { .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, - .connect = vsock_stream_connect, + .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, @@ -2015,10 +2137,31 @@ static const struct proto_ops vsock_stream_ops = { .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, - .setsockopt = vsock_stream_setsockopt, - .getsockopt = vsock_stream_getsockopt, - .sendmsg = vsock_stream_sendmsg, - .recvmsg = vsock_stream_recvmsg, + .setsockopt = vsock_connectible_setsockopt, + .getsockopt = vsock_connectible_getsockopt, + .sendmsg = vsock_connectible_sendmsg, + .recvmsg = vsock_connectible_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops vsock_seqpacket_ops = { + .family = PF_VSOCK, + .owner = THIS_MODULE, + .release = vsock_release, + .bind = vsock_bind, + .connect = vsock_connect, + .socketpair = sock_no_socketpair, + .accept = vsock_accept, + .getname = vsock_getname, + .poll = vsock_poll, + .ioctl = sock_no_ioctl, + .listen = vsock_listen, + .shutdown = vsock_shutdown, + .setsockopt = vsock_connectible_setsockopt, + .getsockopt = vsock_connectible_getsockopt, + .sendmsg = vsock_connectible_sendmsg, + .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; @@ -2043,6 +2186,9 @@ static int vsock_create(struct net *net, struct socket *sock, case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; + case SOCK_SEQPACKET: + sock->ops = &vsock_seqpacket_ops; + break; default: return -ESOCKTNOSUPPORT; } diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index cc3bae2659e7..19189cf30a72 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -596,7 +596,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, return -EOPNOTSUPP; if (need_refill) { - hvs->recv_desc = hv_pkt_iter_first(hvs->chan); + hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan); ret = hvs_update_recv_data(hvs); if (ret) return ret; @@ -610,7 +610,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, hvs->recv_data_len -= to_read; if (hvs->recv_data_len == 0) { - hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc); + hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc); if (hvs->recv_desc) { ret = hvs_update_recv_data(hvs); if (ret) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 2700a63ab095..4f7c99dfd16c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -62,6 +62,7 @@ struct virtio_vsock { struct virtio_vsock_event event_list[8]; u32 guest_cid; + bool seqpacket_allow; }; static u32 virtio_transport_get_local_cid(void) @@ -356,11 +357,14 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { - lock_sock(sk); + /* vmci_transport.c doesn't take sk_lock here either. At least we're + * under vsock_table_lock so the sock cannot disappear while we're + * executing. + */ + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; - sk->sk_error_report(sk); - release_sock(sk); + sk_error_report(sk); } static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock) @@ -443,6 +447,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } +static bool virtio_transport_seqpacket_allow(u32 remote_cid); + static struct virtio_transport virtio_transport = { .transport = { .module = THIS_MODULE, @@ -469,6 +475,11 @@ static struct virtio_transport virtio_transport = { .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, + .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, + .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, + .seqpacket_allow = virtio_transport_seqpacket_allow, + .seqpacket_has_data = virtio_transport_seqpacket_has_data, + .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, @@ -485,6 +496,21 @@ static struct virtio_transport virtio_transport = { .send_pkt = virtio_transport_send_pkt, }; +static bool virtio_transport_seqpacket_allow(u32 remote_cid) +{ + struct virtio_vsock *vsock; + bool seqpacket_allow; + + seqpacket_allow = false; + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); + if (vsock) + seqpacket_allow = vsock->seqpacket_allow; + rcu_read_unlock(); + + return seqpacket_allow; +} + static void virtio_transport_rx_work(struct work_struct *work) { struct virtio_vsock *vsock = @@ -608,10 +634,14 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->event_run = true; mutex_unlock(&vsock->event_lock); + if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET)) + vsock->seqpacket_allow = true; + vdev->priv = vsock; rcu_assign_pointer(the_virtio_vsock, vsock); mutex_unlock(&the_virtio_vsock_mutex); + return 0; out: @@ -695,6 +725,7 @@ static struct virtio_device_id id_table[] = { }; static unsigned int features[] = { + VIRTIO_VSOCK_F_SEQPACKET }; static struct virtio_driver virtio_vsock_driver = { diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 902cb6dd710b..59ee1be5a6dd 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -74,6 +74,14 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, err = memcpy_from_msg(pkt->buf, info->msg, len); if (err) goto out; + + if (msg_data_left(info->msg) == 0 && + info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + + if (info->msg->msg_flags & MSG_EOR) + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + } } trace_virtio_transport_alloc_pkt(src_cid, src_port, @@ -165,6 +173,14 @@ void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); +static u16 virtio_transport_get_type(struct sock *sk) +{ + if (sk->sk_type == SOCK_STREAM) + return VIRTIO_VSOCK_TYPE_STREAM; + else + return VIRTIO_VSOCK_TYPE_SEQPACKET; +} + /* This function can only be used on connecting/connected sockets, * since a socket assigned to a transport is required. * @@ -179,6 +195,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt; u32 pkt_len = info->pkt_len; + info->type = virtio_transport_get_type(sk_vsock(vsk)); + t_ops = virtio_transport_get_ops(vsk); if (unlikely(!t_ops)) return -EFAULT; @@ -269,13 +287,10 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) } EXPORT_SYMBOL_GPL(virtio_transport_put_credit); -static int virtio_transport_send_credit_update(struct vsock_sock *vsk, - int type, - struct virtio_vsock_hdr *hdr) +static int virtio_transport_send_credit_update(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, - .type = type, .vsk = vsk, }; @@ -383,11 +398,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * messages, we set the limit to a high value. TODO: experiment * with different values. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { - virtio_transport_send_credit_update(vsk, - VIRTIO_VSOCK_TYPE_STREAM, - NULL); - } + if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + virtio_transport_send_credit_update(vsk); return total; @@ -397,6 +409,78 @@ out: return err; } +static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + int dequeued_len = 0; + size_t user_buf_len = msg_data_left(msg); + bool msg_ready = false; + + spin_lock_bh(&vvs->rx_lock); + + if (vvs->msg_count == 0) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + while (!msg_ready) { + pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); + + if (dequeued_len >= 0) { + size_t pkt_len; + size_t bytes_to_copy; + + pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); + bytes_to_copy = min(user_buf_len, pkt_len); + + if (bytes_to_copy) { + int err; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); + if (err) { + /* Copy of message failed. Rest of + * fragments will be freed without copy. + */ + dequeued_len = err; + } else { + user_buf_len -= bytes_to_copy; + } + + spin_lock_bh(&vvs->rx_lock); + } + + if (dequeued_len >= 0) + dequeued_len += pkt_len; + } + + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { + msg_ready = true; + vvs->msg_count--; + + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; + } + + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + spin_unlock_bh(&vvs->rx_lock); + + virtio_transport_send_credit_update(vsk); + + return dequeued_len; +} + ssize_t virtio_transport_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, @@ -409,6 +493,38 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); +ssize_t +virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); + +int +virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + spin_lock_bh(&vvs->tx_lock); + + if (len > vvs->peer_buf_alloc) { + spin_unlock_bh(&vvs->tx_lock); + return -EMSGSIZE; + } + + spin_unlock_bh(&vvs->tx_lock); + + return virtio_transport_stream_enqueue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue); + int virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, @@ -431,6 +547,19 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); +u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + u32 msg_count; + + spin_lock_bh(&vvs->rx_lock); + msg_count = vvs->msg_count; + spin_unlock_bh(&vvs->rx_lock); + + return msg_count; +} +EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data); + static s64 virtio_transport_has_space(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; @@ -496,8 +625,7 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) vvs->buf_alloc = *val; - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); + virtio_transport_send_credit_update(vsk); } EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); @@ -624,7 +752,6 @@ int virtio_transport_connect(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, - .type = VIRTIO_VSOCK_TYPE_STREAM, .vsk = vsk, }; @@ -636,7 +763,6 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_SHUTDOWN, - .type = VIRTIO_VSOCK_TYPE_STREAM, .flags = (mode & RCV_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? @@ -665,7 +791,6 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_STREAM, .msg = msg, .pkt_len = len, .vsk = vsk, @@ -688,7 +813,6 @@ static int virtio_transport_reset(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, - .type = VIRTIO_VSOCK_TYPE_STREAM, .reply = !!pkt, .vsk = vsk, }; @@ -848,7 +972,7 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - if (sk->sk_type == SOCK_STREAM) + if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) remove_sock = virtio_transport_close(vsk); if (remove_sock) { @@ -890,7 +1014,7 @@ destroy: virtio_transport_reset(vsk, pkt); sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; - sk->sk_error_report(sk); + sk_error_report(sk); return err; } @@ -912,6 +1036,9 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, goto out; } + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) + vvs->msg_count++; + /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. @@ -923,13 +1050,18 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct virtio_vsock_pkt, list); /* If there is space in the last packet queued, we copy the - * new packet in its buffer. + * new packet in its buffer. We avoid this if the last packet + * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is + * delimiter of SEQPACKET message, so 'pkt' is the first packet + * of a new message. */ - if (pkt->len <= last_pkt->buf_len - last_pkt->len) { + if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && + !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { memcpy(last_pkt->buf + last_pkt->len, pkt->buf, pkt->len); last_pkt->len += pkt->len; free_pkt = true; + last_pkt->hdr.flags |= pkt->hdr.flags; goto out; } } @@ -954,6 +1086,9 @@ virtio_transport_recv_connected(struct sock *sk, virtio_transport_recv_enqueue(vsk, pkt); sk->sk_data_ready(sk); return err; + case VIRTIO_VSOCK_OP_CREDIT_REQUEST: + virtio_transport_send_credit_update(vsk); + break; case VIRTIO_VSOCK_OP_CREDIT_UPDATE: sk->sk_write_space(sk); break; @@ -1000,7 +1135,6 @@ virtio_transport_send_response(struct vsock_sock *vsk, { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, - .type = VIRTIO_VSOCK_TYPE_STREAM, .remote_cid = le64_to_cpu(pkt->hdr.src_cid), .remote_port = le32_to_cpu(pkt->hdr.src_port), .reply = true, @@ -1096,6 +1230,12 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, return 0; } +static bool virtio_transport_valid_type(u16 type) +{ + return (type == VIRTIO_VSOCK_TYPE_STREAM) || + (type == VIRTIO_VSOCK_TYPE_SEQPACKET); +} + /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ @@ -1121,7 +1261,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, le32_to_cpu(pkt->hdr.buf_alloc), le32_to_cpu(pkt->hdr.fwd_cnt)); - if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { (void)virtio_transport_reset_no_sock(t, pkt); goto free_pkt; } @@ -1138,6 +1278,12 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, } } + if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { + (void)virtio_transport_reset_no_sock(t, pkt); + sock_put(sk); + goto free_pkt; + } + vsk = vsock_sk(sk); lock_sock(sk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c99bc4ce78e2..7aef34e32bdf 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -831,7 +831,7 @@ static void vmci_transport_handle_detach(struct sock *sk) sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; - sk->sk_error_report(sk); + sk_error_report(sk); return; } sk->sk_state = TCP_CLOSE; @@ -1248,7 +1248,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, vsock_remove_pending(listener, pending); vsock_enqueue_accept(listener, pending); - /* Callers of accept() will be be waiting on the listening socket, not + /* Callers of accept() will be waiting on the listening socket, not * the pending socket. */ listener->sk_data_ready(listener); @@ -1365,7 +1365,7 @@ destroy: sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; - sk->sk_error_report(sk); + sk_error_report(sk); return err; } diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index a45f7ffca8c5..169a8cf65b39 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -63,6 +63,8 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) return 0; } +static bool vsock_loopback_seqpacket_allow(u32 remote_cid); + static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, @@ -89,6 +91,11 @@ static struct virtio_transport loopback_transport = { .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, + .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, + .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, + .seqpacket_allow = vsock_loopback_seqpacket_allow, + .seqpacket_has_data = virtio_transport_seqpacket_has_data, + .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, @@ -105,6 +112,11 @@ static struct virtio_transport loopback_transport = { .send_pkt = vsock_loopback_send_pkt, }; +static bool vsock_loopback_seqpacket_allow(u32 remote_cid) +{ + return true; +} + static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 285b8076054b..869c43d4414c 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018-2020 Intel Corporation + * Copyright 2018-2021 Intel Corporation */ #include <linux/export.h> @@ -942,7 +942,7 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, struct ieee80211_sta_vht_cap *vht_cap; struct ieee80211_edmg *edmg_cap; u32 width, control_freq, cap; - bool support_80_80 = false; + bool ext_nss_cap, support_80_80 = false; if (WARN_ON(!cfg80211_chandef_valid(chandef))) return false; @@ -950,6 +950,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap; vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap; edmg_cap = &wiphy->bands[chandef->chan->band]->edmg_cap; + ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & + IEEE80211_VHT_EXT_NSS_BW_CAPABLE; if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, @@ -1015,7 +1017,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || - u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1; + (ext_nss_cap && + u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1); if (chandef->chan->band != NL80211_BAND_6GHZ && !support_80_80) return false; fallthrough; @@ -1037,7 +1040,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ && - !(vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)) + !(ext_nss_cap && + (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK))) return false; break; default: @@ -1335,3 +1339,34 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, WARN_ON(1); } } + +bool cfg80211_any_usable_channels(struct wiphy *wiphy, + unsigned long sband_mask, + u32 prohibited_flags) +{ + int idx; + + prohibited_flags |= IEEE80211_CHAN_DISABLED; + + for_each_set_bit(idx, &sband_mask, NUM_NL80211_BANDS) { + struct ieee80211_supported_band *sband = wiphy->bands[idx]; + int chanidx; + + if (!sband) + continue; + + for (chanidx = 0; chanidx < sband->n_channels; chanidx++) { + struct ieee80211_channel *chan; + + chan = &sband->channels[chanidx]; + + if (chan->flags & prohibited_flags) + continue; + + return true; + } + } + + return false; +} +EXPORT_SYMBOL(cfg80211_any_usable_channels); diff --git a/net/wireless/core.c b/net/wireless/core.c index 8d0883e81093..03323121ca50 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -532,11 +532,11 @@ use_default_name: wiphy_net_set(&rdev->wiphy, &init_net); rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block; - rdev->rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), - &rdev->wiphy.dev, RFKILL_TYPE_WLAN, - &rdev->rfkill_ops, rdev); + rdev->wiphy.rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), + &rdev->wiphy.dev, RFKILL_TYPE_WLAN, + &rdev->rfkill_ops, rdev); - if (!rdev->rfkill) { + if (!rdev->wiphy.rfkill) { wiphy_free(&rdev->wiphy); return NULL; } @@ -589,14 +589,6 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) if (WARN_ON(!c->num_different_channels)) return -EINVAL; - /* - * Put a sane limit on maximum number of different - * channels to simplify channel accounting code. - */ - if (WARN_ON(c->num_different_channels > - CFG80211_MAX_NUM_DIFFERENT_CHANNELS)) - return -EINVAL; - /* DFS only works on one channel. */ if (WARN_ON(c->radar_detect_widths && (c->num_different_channels > 1))) @@ -936,9 +928,6 @@ int wiphy_register(struct wiphy *wiphy) return res; } - /* set up regulatory info */ - wiphy_regulatory_register(wiphy); - list_add_rcu(&rdev->list, &cfg80211_rdev_list); cfg80211_rdev_list_generation++; @@ -949,6 +938,9 @@ int wiphy_register(struct wiphy *wiphy) cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + /* set up regulatory info */ + wiphy_regulatory_register(wiphy); + if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request; @@ -993,10 +985,10 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.registered = true; rtnl_unlock(); - res = rfkill_register(rdev->rfkill); + res = rfkill_register(rdev->wiphy.rfkill); if (res) { - rfkill_destroy(rdev->rfkill); - rdev->rfkill = NULL; + rfkill_destroy(rdev->wiphy.rfkill); + rdev->wiphy.rfkill = NULL; wiphy_unregister(&rdev->wiphy); return res; } @@ -1012,18 +1004,10 @@ void wiphy_rfkill_start_polling(struct wiphy *wiphy) if (!rdev->ops->rfkill_poll) return; rdev->rfkill_ops.poll = cfg80211_rfkill_poll; - rfkill_resume_polling(rdev->rfkill); + rfkill_resume_polling(wiphy->rfkill); } EXPORT_SYMBOL(wiphy_rfkill_start_polling); -void wiphy_rfkill_stop_polling(struct wiphy *wiphy) -{ - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - - rfkill_pause_polling(rdev->rfkill); -} -EXPORT_SYMBOL(wiphy_rfkill_stop_polling); - void wiphy_unregister(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -1035,8 +1019,8 @@ void wiphy_unregister(struct wiphy *wiphy) wiphy_unlock(&rdev->wiphy); __count == 0; })); - if (rdev->rfkill) - rfkill_unregister(rdev->rfkill); + if (rdev->wiphy.rfkill) + rfkill_unregister(rdev->wiphy.rfkill); rtnl_lock(); wiphy_lock(&rdev->wiphy); @@ -1088,7 +1072,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *scan, *tmp; struct cfg80211_beacon_registration *reg, *treg; - rfkill_destroy(rdev->rfkill); + rfkill_destroy(rdev->wiphy.rfkill); list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) { list_del(®->list); kfree(reg); @@ -1110,7 +1094,7 @@ void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - if (rfkill_set_hw_state_reason(rdev->rfkill, blocked, reason)) + if (rfkill_set_hw_state_reason(wiphy->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); @@ -1503,7 +1487,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->use_4addr, 0)) return notifier_from_errno(-EOPNOTSUPP); - if (rfkill_blocked(rdev->rfkill)) + if (rfkill_blocked(rdev->wiphy.rfkill)) return notifier_from_errno(-ERFKILL); break; default: diff --git a/net/wireless/core.h b/net/wireless/core.h index a7d19b4b40ac..b35d0db12f1d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -27,7 +27,6 @@ struct cfg80211_registered_device { /* rfkill support */ struct rfkill_ops rfkill_ops; - struct rfkill *rfkill; struct work_struct rfkill_block; /* ISO / IEC 3166 alpha2 for which this device is receiving diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fc9286afe3c9..bf7cd4752547 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -330,7 +330,7 @@ nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { }; static const struct nla_policy -nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { +nl80211_pmsr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy), [NL80211_PMSR_PEER_ATTR_REQ] = @@ -345,7 +345,7 @@ nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = { [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT }, [NL80211_PMSR_ATTR_PEERS] = - NLA_POLICY_NESTED_ARRAY(nl80211_psmr_peer_attr_policy), + NLA_POLICY_NESTED_ARRAY(nl80211_pmsr_peer_attr_policy), }; static const struct nla_policy @@ -759,6 +759,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT }, [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy), [NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG }, + [NL80211_ATTR_OBSS_COLOR_BITMAP] = { .type = NLA_U64 }, + [NL80211_ATTR_COLOR_CHANGE_COUNT] = { .type = NLA_U8 }, + [NL80211_ATTR_COLOR_CHANGE_COLOR] = { .type = NLA_U8 }, + [NL80211_ATTR_COLOR_CHANGE_ELEMS] = NLA_POLICY_NESTED(nl80211_policy), }; /* policy for the key attributes */ @@ -1731,6 +1735,11 @@ nl80211_send_iftype_data(struct sk_buff *msg, &iftdata->he_6ghz_capa)) return -ENOBUFS; + if (iftdata->vendor_elems.data && iftdata->vendor_elems.len && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS, + iftdata->vendor_elems.len, iftdata->vendor_elems.data)) + return -ENOBUFS; + return 0; } @@ -2346,7 +2355,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, goto nla_put_failure; for (band = state->band_start; - band < NUM_NL80211_BANDS; band++) { + band < (state->split ? + NUM_NL80211_BANDS : + NL80211_BAND_60GHZ + 1); + band++) { struct ieee80211_supported_band *sband; /* omit higher bands for ancient software */ @@ -4781,11 +4793,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, sband->ht_cap.mcs.rx_mask, sizeof(mask->control[i].ht_mcs)); - if (!sband->vht_cap.vht_supported) - continue; - - vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); - vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + if (sband->vht_cap.vht_supported) { + vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + } he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); if (!he_cap) @@ -6520,8 +6531,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) err = rdev_change_station(rdev, dev, mac_addr, ¶ms); out_put_vlan: - if (params.vlan) - dev_put(params.vlan); + dev_put(params.vlan); return err; } @@ -6756,8 +6766,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) err = rdev_add_station(rdev, dev, mac_addr, ¶ms); - if (params.vlan) - dev_put(params.vlan); + dev_put(params.vlan); return err; } @@ -8482,8 +8491,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; nl80211_send_scan_start(rdev, wdev); - if (wdev->netdev) - dev_hold(wdev->netdev); + dev_hold(wdev->netdev); return 0; @@ -13042,7 +13050,7 @@ static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info) if (wdev_running(wdev)) return 0; - if (rfkill_blocked(rdev->rfkill)) + if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; err = rdev_start_p2p_device(rdev, wdev); @@ -13084,7 +13092,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (wdev_running(wdev)) return -EEXIST; - if (rfkill_blocked(rdev->rfkill)) + if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) @@ -14796,6 +14804,106 @@ bad_tid_conf: return ret; } +static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_color_change_settings params = {}; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct nlattr **tb; + u16 offset; + int err; + + if (!rdev->ops->color_change) + return -EOPNOTSUPP; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BSS_COLOR)) + return -EOPNOTSUPP; + + if (wdev->iftype != NL80211_IFTYPE_AP) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT] || + !info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR] || + !info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS]) + return -EINVAL; + + params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]); + params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]); + + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon_next); + if (err) + return err; + + tb = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + + err = nla_parse_nested(tb, NL80211_ATTR_MAX, + info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS], + nl80211_policy, info->extack); + if (err) + goto out; + + err = nl80211_parse_beacon(rdev, tb, ¶ms.beacon_color_change); + if (err) + goto out; + + if (!tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) { + err = -EINVAL; + goto out; + } + + if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) != sizeof(u16)) { + err = -EINVAL; + goto out; + } + + offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]); + if (offset >= params.beacon_color_change.tail_len) { + err = -EINVAL; + goto out; + } + + if (params.beacon_color_change.tail[offset] != params.count) { + err = -EINVAL; + goto out; + } + + params.counter_offset_beacon = offset; + + if (tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { + if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) != + sizeof(u16)) { + err = -EINVAL; + goto out; + } + + offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]); + if (offset >= params.beacon_color_change.probe_resp_len) { + err = -EINVAL; + goto out; + } + + if (params.beacon_color_change.probe_resp[offset] != + params.count) { + err = -EINVAL; + goto out; + } + + params.counter_offset_presp = offset; + } + + wdev_lock(wdev); + err = rdev_color_change(rdev, dev, ¶ms); + wdev_unlock(wdev); + +out: + kfree(tb); + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -14853,9 +14961,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return -ENETDOWN; } - if (dev) - dev_hold(dev); - + dev_hold(dev); info->user_ptr[0] = rdev; } @@ -14877,8 +14983,7 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL80211_FLAG_NEED_WDEV) { struct wireless_dev *wdev = info->user_ptr[1]; - if (wdev->netdev) - dev_put(wdev->netdev); + dev_put(wdev->netdev); } else { dev_put(info->user_ptr[1]); } @@ -15794,6 +15899,14 @@ static const struct genl_small_ops nl80211_small_ops[] = { .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_COLOR_CHANGE_REQUEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = nl80211_color_change, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -17423,6 +17536,51 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); +int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, + enum nl80211_commands cmd, u8 count, + u64 color_bitmap) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + ASSERT_WDEV_LOCK(wdev); + + trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED && + nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count)) + goto nla_put_failure; + + if (cmd == NL80211_CMD_OBSS_COLOR_COLLISION && + nla_put_u64_64bit(msg, NL80211_ATTR_OBSS_COLOR_BITMAP, + color_bitmap, NL80211_ATTR_PAD)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_MLME, gfp); + +nla_put_failure: + nlmsg_free(msg); + return -EINVAL; +} +EXPORT_SYMBOL(cfg80211_bss_color_notify); + void nl80211_radar_notify(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index d245968b74cb..328cf54bda82 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -168,6 +168,18 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]) { + if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR], + "FTM: BSS color set for EDCA based ranging"); + return -EINVAL; + } + + out->ftm.bss_color = + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); + } + return 0; } diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 36f1b59a78bf..ae2e1a896461 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -115,23 +115,22 @@ int ieee80211_radiotap_iterator_init( iterator->_max_length = get_unaligned_le16(&radiotap_header->it_len); iterator->_arg_index = 0; iterator->_bitmap_shifter = get_unaligned_le32(&radiotap_header->it_present); - iterator->_arg = (uint8_t *)radiotap_header + sizeof(*radiotap_header); + iterator->_arg = (uint8_t *)radiotap_header->it_optional; iterator->_reset_on_ext = 0; - iterator->_next_bitmap = &radiotap_header->it_present; - iterator->_next_bitmap++; + iterator->_next_bitmap = radiotap_header->it_optional; iterator->_vns = vns; iterator->current_namespace = &radiotap_ns; iterator->is_radiotap_ns = 1; /* find payload start allowing for extended bitmap(s) */ - if (iterator->_bitmap_shifter & (1<<IEEE80211_RADIOTAP_EXT)) { + if (iterator->_bitmap_shifter & (BIT(IEEE80211_RADIOTAP_EXT))) { if ((unsigned long)iterator->_arg - (unsigned long)iterator->_rtheader + sizeof(uint32_t) > (unsigned long)iterator->_max_length) return -EINVAL; while (get_unaligned_le32(iterator->_arg) & - (1 << IEEE80211_RADIOTAP_EXT)) { + (BIT(IEEE80211_RADIOTAP_EXT))) { iterator->_arg += sizeof(uint32_t); /* diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 8b1358d04ca2..ce6bf218a1a3 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -464,8 +464,18 @@ static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { + const struct cfg80211_bss_ies *bss_ies; int ret; - trace_rdev_assoc(&rdev->wiphy, dev, req); + + /* + * Note: we might trace not exactly the data that's processed, + * due to races and the driver/mac80211 getting a newer copy. + */ + rcu_read_lock(); + bss_ies = rcu_dereference(req->bss->ies); + trace_rdev_assoc(&rdev->wiphy, dev, req, bss_ies); + rcu_read_unlock(); + ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; @@ -1358,4 +1368,17 @@ static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_color_change(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_color_change_settings *params) +{ + int ret; + + trace_rdev_color_change(&rdev->wiphy, dev, params); + ret = rdev->ops->color_change(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 0406ce7334fa..df87c7f3a049 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -171,9 +171,11 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) { const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; + enum nl80211_dfs_regions dfs_region; rcu_read_lock(); regd = get_cfg80211_regdom(); + dfs_region = regd->dfs_region; if (!wiphy) goto out; @@ -182,6 +184,11 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) if (!wiphy_regd) goto out; + if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { + dfs_region = wiphy_regd->dfs_region; + goto out; + } + if (wiphy_regd->dfs_region == regd->dfs_region) goto out; @@ -193,7 +200,7 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) out: rcu_read_unlock(); - return regd->dfs_region; + return dfs_region; } static void rcu_free_regdom(const struct ieee80211_regdomain *r) @@ -3975,7 +3982,9 @@ static int __regulatory_set_wiphy_regd(struct wiphy *wiphy, "wiphy should have REGULATORY_WIPHY_SELF_MANAGED\n")) return -EPERM; - if (WARN(!is_valid_rd(rd), "Invalid regulatory domain detected\n")) { + if (WARN(!is_valid_rd(rd), + "Invalid regulatory domain detected: %c%c\n", + rd->alpha2[0], rd->alpha2[1])) { print_regdomain_info(rd); return -EINVAL; } @@ -4049,6 +4058,7 @@ void wiphy_regulatory_register(struct wiphy *wiphy) wiphy_update_regulatory(wiphy, lr->initiator); wiphy_all_share_dfs_chan_state(wiphy); + reg_process_self_managed_hints(); } void wiphy_regulatory_deregister(struct wiphy *wiphy) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4f06c1825029..11c68b159324 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #include <linux/kernel.h> #include <linux/slab.h> @@ -618,7 +618,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, freq = ieee80211_channel_to_frequency(ap_info->channel, band); - if (end - pos < count * ap_info->tbtt_info_len) + if (end - pos < count * length) break; /* @@ -630,7 +630,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, if (band != NL80211_BAND_6GHZ || (length != IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM && length < IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM)) { - pos += count * ap_info->tbtt_info_len; + pos += count * length; continue; } @@ -653,7 +653,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, kfree(entry); } - pos += ap_info->tbtt_info_len; + pos += length; } } @@ -757,7 +757,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) } request = kzalloc(struct_size(request, channels, n_channels) + - sizeof(*request->scan_6ghz_params) * count, + sizeof(*request->scan_6ghz_params) * count + + sizeof(*request->ssids) * rdev_req->n_ssids, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); @@ -848,10 +849,19 @@ skip: if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; - rdev->int_scan_req = request; /* + * Add the ssids from the parent scan request to the new scan + * request, so the driver would be able to use them in its + * probe requests to discover hidden APs on PSC channels. + */ + request->ssids = (void *)&request->channels[request->n_channels]; + request->n_ssids = rdev_req->n_ssids; + memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * + request->n_ssids); + + /* * If this scan follows a previous scan, save the scan start * info from the first part of the scan */ @@ -965,8 +975,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, } #endif - if (wdev->netdev) - dev_put(wdev->netdev); + dev_put(wdev->netdev); kfree(rdev->int_scan_req); rdev->int_scan_req = NULL; @@ -1744,16 +1753,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, * be grouped with this beacon for updates ... */ if (!cfg80211_combine_bsses(rdev, new)) { - kfree(new); + bss_ref_put(rdev, new); goto drop; } } if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { - if (!list_empty(&new->hidden_list)) - list_del(&new->hidden_list); - kfree(new); + bss_ref_put(rdev, new); goto drop; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 76b777d5903f..19b78d472283 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1195,8 +1195,9 @@ TRACE_EVENT(rdev_auth, TRACE_EVENT(rdev_assoc, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_assoc_request *req), - TP_ARGS(wiphy, netdev, req), + struct cfg80211_assoc_request *req, + const struct cfg80211_bss_ies *bss_ies), + TP_ARGS(wiphy, netdev, req, bss_ies), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1204,6 +1205,17 @@ TRACE_EVENT(rdev_assoc, MAC_ENTRY(prev_bssid) __field(bool, use_mfp) __field(u32, flags) + __dynamic_array(u8, bss_elements, bss_ies->len) + __field(bool, bss_elements_bcon) + __field(u64, bss_elements_tsf) + __dynamic_array(u8, elements, req->ie_len) + __array(u8, ht_capa, sizeof(struct ieee80211_ht_cap)) + __array(u8, ht_capa_mask, sizeof(struct ieee80211_ht_cap)) + __array(u8, vht_capa, sizeof(struct ieee80211_vht_cap)) + __array(u8, vht_capa_mask, sizeof(struct ieee80211_vht_cap)) + __dynamic_array(u8, fils_kek, req->fils_kek_len) + __dynamic_array(u8, fils_nonces, + req->fils_nonces ? 2 * FILS_NONCE_LEN : 0) ), TP_fast_assign( WIPHY_ASSIGN; @@ -1215,6 +1227,26 @@ TRACE_EVENT(rdev_assoc, MAC_ASSIGN(prev_bssid, req->prev_bssid); __entry->use_mfp = req->use_mfp; __entry->flags = req->flags; + if (bss_ies->len) + memcpy(__get_dynamic_array(bss_elements), + bss_ies->data, bss_ies->len); + __entry->bss_elements_bcon = bss_ies->from_beacon; + __entry->bss_elements_tsf = bss_ies->tsf; + if (req->ie) + memcpy(__get_dynamic_array(elements), + req->ie, req->ie_len); + memcpy(__entry->ht_capa, &req->ht_capa, sizeof(req->ht_capa)); + memcpy(__entry->ht_capa_mask, &req->ht_capa_mask, + sizeof(req->ht_capa_mask)); + memcpy(__entry->vht_capa, &req->vht_capa, sizeof(req->vht_capa)); + memcpy(__entry->vht_capa_mask, &req->vht_capa_mask, + sizeof(req->vht_capa_mask)); + if (req->fils_kek) + memcpy(__get_dynamic_array(fils_kek), + req->fils_kek, req->fils_kek_len); + if (req->fils_nonces) + memcpy(__get_dynamic_array(fils_nonces), + req->fils_nonces, 2 * FILS_NONCE_LEN); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", previous bssid: " MAC_PR_FMT ", use mfp: %s, flags: %u", @@ -3565,6 +3597,52 @@ TRACE_EVENT(rdev_set_sar_specs, WIPHY_PR_ARG, __entry->type, __entry->num) ); +TRACE_EVENT(rdev_color_change, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_color_change_settings *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u8, count) + __field(u16, bcn_ofs) + __field(u16, pres_ofs) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->count = params->count; + __entry->bcn_ofs = params->counter_offset_beacon; + __entry->pres_ofs = params->counter_offset_presp; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT + ", count: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->count) +); + +TRACE_EVENT(cfg80211_bss_color_notify, + TP_PROTO(struct net_device *netdev, + enum nl80211_commands cmd, + u8 count, u64 color_bitmap), + TP_ARGS(netdev, cmd, count, color_bitmap), + TP_STRUCT__entry( + NETDEV_ENTRY + __field(u32, cmd) + __field(u8, count) + __field(u64, color_bitmap) + ), + TP_fast_assign( + NETDEV_ASSIGN; + __entry->cmd = cmd; + __entry->count = count; + __entry->color_bitmap = color_bitmap; + ), + TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx", + NETDEV_PR_ARG, __entry->cmd, __entry->count, + __entry->color_bitmap) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a8320dc59af7..a32065d600a1 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -902,7 +902,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, /* only change when not disabling */ if (!data->txpower.disabled) { - rfkill_set_sw_state(rdev->rfkill, false); + rfkill_set_sw_state(rdev->wiphy.rfkill, false); if (data->txpower.fixed) { /* @@ -927,7 +927,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, } } } else { - if (rfkill_set_sw_state(rdev->rfkill, true)) + if (rfkill_set_sw_state(rdev->wiphy.rfkill, true)) schedule_work(&rdev->rfkill_block); return 0; } @@ -963,7 +963,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, /* well... oh well */ data->txpower.fixed = 1; - data->txpower.disabled = rfkill_blocked(rdev->rfkill); + data->txpower.disabled = rfkill_blocked(rdev->wiphy.rfkill); data->txpower.value = val; data->txpower.flags = IW_TXPOW_DBM; @@ -1167,7 +1167,7 @@ static int cfg80211_wext_siwpower(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - bool ps = wdev->ps; + bool ps; int timeout = wdev->ps_timeout; int err; diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c index 33bef22e44e9..b379a0371653 100644 --- a/net/wireless/wext-spy.c +++ b/net/wireless/wext-spy.c @@ -120,8 +120,8 @@ int iw_handler_set_thrspy(struct net_device * dev, return -EOPNOTSUPP; /* Just do it */ - memcpy(&(spydata->spy_thr_low), &(threshold->low), - 2 * sizeof(struct iw_quality)); + spydata->spy_thr_low = threshold->low; + spydata->spy_thr_high = threshold->high; /* Clear flag */ memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); @@ -147,8 +147,8 @@ int iw_handler_get_thrspy(struct net_device * dev, return -EOPNOTSUPP; /* Just do it */ - memcpy(&(threshold->low), &(spydata->spy_thr_low), - 2 * sizeof(struct iw_quality)); + threshold->low = spydata->spy_thr_low; + threshold->high = spydata->spy_thr_high; return 0; } @@ -173,10 +173,10 @@ static void iw_send_thrspy_event(struct net_device * dev, memcpy(threshold.addr.sa_data, address, ETH_ALEN); threshold.addr.sa_family = ARPHRD_ETHER; /* Copy stats */ - memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); + threshold.qual = *wstats; /* Copy also thresholds */ - memcpy(&(threshold.low), &(spydata->spy_thr_low), - 2 * sizeof(struct iw_quality)); + threshold.low = spydata->spy_thr_low; + threshold.high = spydata->spy_thr_high; /* Send event to user space */ wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 1816899499ce..3583354a7d7f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -366,7 +366,7 @@ static void x25_destroy_timer(struct timer_list *t) /* * This is called from user mode and the timers. Thus it protects itself - * against interrupt users but doesn't worry about being called during + * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c index d48ad6d29197..21b30b56e889 100644 --- a/net/x25/x25_forward.c +++ b/net/x25/x25_forward.c @@ -19,7 +19,6 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, { struct x25_route *rt; struct x25_neigh *neigh_new = NULL; - struct list_head *entry; struct x25_forward *x25_frwd, *new_frwd; struct sk_buff *skbn; short same_lci = 0; @@ -46,8 +45,7 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, * established LCI? It shouldn't happen, just in case.. */ read_lock_bh(&x25_forward_list_lock); - list_for_each(entry, &x25_forward_list) { - x25_frwd = list_entry(entry, struct x25_forward, node); + list_for_each_entry(x25_frwd, &x25_forward_list, node) { if (x25_frwd->lci == lci) { pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n"); same_lci = 1; @@ -92,15 +90,13 @@ out_no_route: int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { struct x25_forward *frwd; - struct list_head *entry; struct net_device *peer = NULL; struct x25_neigh *nb; struct sk_buff *skbn; int rc = 0; read_lock_bh(&x25_forward_list_lock); - list_for_each(entry, &x25_forward_list) { - frwd = list_entry(entry, struct x25_forward, node); + list_for_each_entry(frwd, &x25_forward_list, node) { if (frwd->lci == lci) { /* The call is established, either side can send */ if (from->dev == frwd->dev1) { diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 57a81100c5da..5460b9146dd8 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -332,12 +332,9 @@ void x25_link_device_down(struct net_device *dev) struct x25_neigh *x25_get_neigh(struct net_device *dev) { struct x25_neigh *nb, *use = NULL; - struct list_head *entry; read_lock_bh(&x25_neigh_list_lock); - list_for_each(entry, &x25_neigh_list) { - nb = list_entry(entry, struct x25_neigh, node); - + list_for_each_entry(nb, &x25_neigh_list, node) { if (nb->dev == dev) { use = nb; break; diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 9fbe4bb38d94..647f325ed867 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -27,14 +27,11 @@ static int x25_add_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; - struct list_head *entry; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits) goto out; @@ -78,14 +75,11 @@ static int x25_del_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; - struct list_head *entry; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits && rt->dev == dev) { __x25_remove_route(rt); @@ -141,13 +135,10 @@ struct net_device *x25_dev_get(char *devname) struct x25_route *x25_get_route(struct x25_address *addr) { struct x25_route *rt, *use = NULL; - struct list_head *entry; read_lock_bh(&x25_route_list_lock); - list_for_each(entry, &x25_route_list) { - rt = list_entry(entry, struct x25_route, node); - + list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, addr, rt->sigdigits)) { if (!use) use = rt; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 56a28a686988..f01ef6bda390 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -27,7 +27,7 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); - kfree(umem->pgs); + kvfree(umem->pgs); umem->pgs = NULL; } @@ -99,8 +99,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) long npgs; int err; - umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), - GFP_KERNEL | __GFP_NOWARN); + umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; @@ -123,7 +122,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) out_pin: xdp_umem_unpin_pages(umem); out_pgs: - kfree(umem->pgs); + kvfree(umem->pgs); umem->pgs = NULL; return err; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd62d4ba87a9..d6b500dc4208 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs) } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, - struct xdp_sock ***map_entry) + struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; @@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ - struct xdp_sock **map_entry = NULL; + struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { @@ -1313,7 +1313,7 @@ static int xsk_notifier(struct notifier_block *this, if (xs->dev == dev) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); xsk_unbind_dev(xs); diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index edcf249ad1f1..a4bc4749faac 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 { struct xsk_map_node { struct list_head node; struct xsk_map *map; - struct xdp_sock **map_entry; + struct xdp_sock __rcu **map_entry; }; static inline struct xdp_sock *xdp_sk(struct sock *sk) @@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); + struct xdp_sock __rcu **map_entry); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9d2a89d793c0..9ae13cccfb28 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -128,12 +128,15 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 chunk; - - if (desc->len > pool->chunk_size) - return false; + u64 chunk, chunk_end; chunk = xp_aligned_extract_addr(pool, desc->addr); + if (likely(desc->len)) { + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len - 1); + if (chunk != chunk_end) + return false; + } + if (chunk >= pool->addrs_cnt) return false; diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 67b4ce504852..2e48d0e094d9 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -12,7 +12,7 @@ #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; @@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) } static void xsk_map_sock_delete(struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; @@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or + * by local_bh_disable() (from XDP calls inside NAPI). The + * rcu_read_lock_bh_held() below makes lockdep accept both. + */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); @@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) if (key >= map->max_entries) return NULL; - return READ_ONCE(m->xsk_map[key]); + return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - WARN_ON_ONCE(!rcu_read_lock_held()); return __xsk_map_lookup_elem(map, *(u32 *)key); } @@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs, *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; @@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } spin_lock_bh(&m->lock); - old_xs = READ_ONCE(*map_entry); + old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; @@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, goto out; } xsk_map_sock_add(xs, node); - WRITE_ONCE(*map_entry, xs); + rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -208,7 +212,8 @@ out: static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs, **map_entry; + struct xdp_sock __rcu **map_entry; + struct xdp_sock *old_xs; int k = *(u32 *)key; if (k >= map->max_entries) @@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; - old_xs = xchg(map_entry, NULL); + old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); @@ -226,15 +231,16 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + __xsk_map_lookup_elem); } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry) + struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); - if (READ_ONCE(*map_entry) == xs) { - WRITE_ONCE(*map_entry, NULL); + if (rcu_access_pointer(*map_entry) == xs) { + rcu_assign_pointer(*map_entry, NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index a20aec9d7393..2bf269390163 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -298,8 +298,16 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src) len = nlmsg_attrlen(nlh_src, xfrm_msg_min[type]); nla_for_each_attr(nla, attrs, len, remaining) { - int err = xfrm_xlate64_attr(dst, nla); + int err; + switch (type) { + case XFRM_MSG_NEWSPDINFO: + err = xfrm_nla_cpy(dst, nla, nla_len(nla)); + break; + default: + err = xfrm_xlate64_attr(dst, nla); + break; + } if (err) return err; } @@ -341,7 +349,8 @@ static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src /* Calculates len of translated 64-bit message. */ static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src, - struct nlattr *attrs[XFRMA_MAX+1]) + struct nlattr *attrs[XFRMA_MAX + 1], + int maxtype) { size_t len = nlmsg_len(src); @@ -358,10 +367,20 @@ static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src, case XFRM_MSG_POLEXPIRE: len += 8; break; + case XFRM_MSG_NEWSPDINFO: + /* attirbutes are xfrm_spdattr_type_t, not xfrm_attr_type_t */ + return len; default: break; } + /* Unexpected for anything, but XFRM_MSG_NEWSPDINFO, please + * correct both 64=>32-bit and 32=>64-bit translators to copy + * new attributes. + */ + if (WARN_ON_ONCE(maxtype)) + return len; + if (attrs[XFRMA_SA]) len += 4; if (attrs[XFRMA_POLICY]) @@ -440,7 +459,8 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, static int xfrm_xlate32(struct nlmsghdr *dst, const struct nlmsghdr *src, struct nlattr *attrs[XFRMA_MAX+1], - size_t size, u8 type, struct netlink_ext_ack *extack) + size_t size, u8 type, int maxtype, + struct netlink_ext_ack *extack) { size_t pos; int i; @@ -520,6 +540,25 @@ static int xfrm_xlate32(struct nlmsghdr *dst, const struct nlmsghdr *src, } pos = dst->nlmsg_len; + if (maxtype) { + /* attirbutes are xfrm_spdattr_type_t, not xfrm_attr_type_t */ + WARN_ON_ONCE(src->nlmsg_type != XFRM_MSG_NEWSPDINFO); + + for (i = 1; i <= maxtype; i++) { + int err; + + if (!attrs[i]) + continue; + + /* just copy - no need for translation */ + err = xfrm_attr_cpy32(dst, &pos, attrs[i], size, + nla_len(attrs[i]), nla_len(attrs[i])); + if (err) + return err; + } + return 0; + } + for (i = 1; i < XFRMA_MAX + 1; i++) { int err; @@ -564,7 +603,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, if (err < 0) return ERR_PTR(err); - len = xfrm_user_rcv_calculate_len64(h32, attrs); + len = xfrm_user_rcv_calculate_len64(h32, attrs, maxtype); /* The message doesn't need translation */ if (len == nlmsg_len(h32)) return NULL; @@ -574,7 +613,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, if (!h64) return ERR_PTR(-ENOMEM); - err = xfrm_xlate32(h64, h32, attrs, len, type, extack); + err = xfrm_xlate32(h64, h32, attrs, len, type, maxtype, extack); if (err < 0) { kvfree(h64); return ERR_PTR(err); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6d6917b68856..e843b0d9e2a6 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -268,6 +268,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->num_exthdrs = 0; xso->flags = 0; xso->dev = NULL; + xso->real_dev = NULL; dev_put(dev); if (err != -EOPNOTSUPP) diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index ce66323102f9..d12bb906c9c9 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -131,6 +131,13 @@ __xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, return (h ^ (h >> 10) ^ (h >> 20)) & hmask; } +static inline unsigned int +__xfrm_seq_hash(u32 seq, unsigned int hmask) +{ + unsigned int h = seq; + return (h ^ (h >> 10) ^ (h >> 20)) & hmask; +} + static inline unsigned int __idx_hash(u32 index, unsigned int hmask) { return (index ^ (index >> 8)) & hmask; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1158cd0311d7..3df0861d4390 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -612,7 +612,7 @@ lock: goto drop_unlock; } - if (x->repl->check(x, skb, seq)) { + if (xfrm_replay_check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } @@ -660,12 +660,12 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (x->repl->recheck(x, skb, seq)) { + if (xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } - x->repl->advance(x, seq); + xfrm_replay_advance(x, seq); x->curlft.bytes += skb->len; x->curlft.packets++; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 8831f5a9e992..41de46b5ffa9 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -432,6 +432,7 @@ static int xfrmi4_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; + break; case ICMP_REDIRECT: break; default: diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 2e8afe078d61..cb40ff0ff28d 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -241,7 +241,7 @@ static void ipcomp_free_tfms(struct crypto_comp * __percpu *tfms) break; } - WARN_ON(!pos); + WARN_ON(list_entry_is_head(pos, &ipcomp_tfms_list, list)); if (--pos->users) return; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e4cb0ff4dcf4..229544bc70c2 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -77,6 +77,83 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } +#if IS_ENABLED(CONFIG_IPV6_MIP6) +static int mip6_rthdr_offset(struct sk_buff *skb, u8 **nexthdr, int type) +{ + const unsigned char *nh = skb_network_header(skb); + unsigned int offset = sizeof(struct ipv6hdr); + unsigned int packet_len; + int found_rhdr = 0; + + packet_len = skb_tail_pointer(skb) - nh; + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; + + switch (**nexthdr) { + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + if (type == IPPROTO_ROUTING && offset + 3 <= packet_len) { + struct ipv6_rt_hdr *rt; + + rt = (struct ipv6_rt_hdr *)(nh + offset); + if (rt->type != 0) + return offset; + } + found_rhdr = 1; + break; + case NEXTHDR_DEST: + /* HAO MUST NOT appear more than once. + * XXX: It is better to try to find by the end of + * XXX: packet if HAO exists. + */ + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { + net_dbg_ratelimited("mip6: hao exists already, override\n"); + return offset; + } + + if (found_rhdr) + return offset; + + break; + default: + return offset; + } + + if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) + return -EINVAL; + + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); + offset += ipv6_optlen(exthdr); + if (offset > IPV6_MAXPLEN) + return -EINVAL; + *nexthdr = &exthdr->nexthdr; + } + + return -EINVAL; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int xfrm6_hdr_offset(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr) +{ + switch (x->type->proto) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + return mip6_rthdr_offset(skb, prevhdr, x->type->proto); +#endif + default: + break; + } + + return ip6_find_1stfragopt(skb, prevhdr); +} +#endif + /* Add encapsulation header. * * The IP header and mutable extension headers will be moved forward to make @@ -92,7 +169,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) return hdr_len; skb_set_mac_header(skb, @@ -122,7 +199,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) iph = ipv6_hdr(skb); - hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) return hdr_len; skb_set_mac_header(skb, @@ -448,7 +525,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error; } - err = x->repl->overflow(x, skb); + err = xfrm_replay_overflow(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR); goto error; @@ -565,6 +642,42 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb return 0; } +/* For partial checksum offload, the outer header checksum is calculated + * by software and the inner header checksum is calculated by hardware. + * This requires hardware to know the inner packet type to calculate + * the inner header checksum. Save inner ip protocol here to avoid + * traversing the packet in the vendor's xmit code. + * If the encap type is IPIP, just save skb->inner_ipproto. Otherwise, + * get the ip protocol from the IP header. + */ +static void xfrm_get_inner_ipproto(struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + const struct ethhdr *eth; + + if (!xo) + return; + + if (skb->inner_protocol_type == ENCAP_TYPE_IPPROTO) { + xo->inner_ipproto = skb->inner_ipproto; + return; + } + + if (skb->inner_protocol_type != ENCAP_TYPE_ETHER) + return; + + eth = (struct ethhdr *)skb_inner_mac_header(skb); + + switch (ntohs(eth->h_proto)) { + case ETH_P_IPV6: + xo->inner_ipproto = inner_ipv6_hdr(skb)->nexthdr; + break; + case ETH_P_IP: + xo->inner_ipproto = inner_ip_hdr(skb)->protocol; + break; + } +} + int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -594,12 +707,15 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return -ENOMEM; } - skb->encapsulation = 1; sp->olen++; sp->xvec[sp->len++] = x; xfrm_state_hold(x); + if (skb->encapsulation) + xfrm_get_inner_ipproto(skb); + skb->encapsulation = 1; + if (skb_is_gso(skb)) { if (skb->inner_protocol) return xfrm_output_gso(net, sk, skb); @@ -711,15 +827,8 @@ out: static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) - unsigned int ptr = 0; int err; - if (x->outer_mode.encap == XFRM_MODE_BEET && - ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL) >= 0) { - net_warn_ratelimited("BEET mode doesn't support inner IPv6 fragments\n"); - return -EAFNOSUPPORT; - } - err = xfrm6_tunnel_check_size(skb); if (err) return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ce500f847b99..37d17a79617c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -155,7 +155,6 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; -static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; @@ -585,7 +584,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); - write_seqcount_begin(&xfrm_policy_hash_generation); + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); @@ -596,7 +595,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; - write_seqcount_end(&xfrm_policy_hash_generation); + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); @@ -1245,7 +1244,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); - write_seqcount_begin(&xfrm_policy_hash_generation); + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. @@ -1354,7 +1353,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) out_unlock: __xfrm_policy_inexact_flush(net); - write_seqcount_end(&xfrm_policy_hash_generation); + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); @@ -1902,8 +1901,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, match = xfrm_selector_match(sel, fl, family); if (match) - ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, - dir); + ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } @@ -2095,9 +2093,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, rcu_read_lock(); retry: do { - sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); - } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { @@ -2128,7 +2126,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } skip_inexact: - if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) @@ -2181,8 +2179,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, goto out; } err = security_xfrm_policy_lookup(pol->security, - fl->flowi_secid, - dir); + fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; @@ -3160,6 +3157,11 @@ ok: return dst; nopol: + if (!(dst_orig->dev->flags & IFF_LOOPBACK) && + !xfrm_default_allow(net, dir)) { + err = -EPERM; + goto error; + } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; @@ -3247,7 +3249,7 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, /* * 0 or more than 0 is returned when validation is succeeded (either bypass - * because of optional transport mode, or next index of the mathced secpath + * because of optional transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. @@ -3548,6 +3550,11 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) { + if (!xfrm_default_allow(net, dir)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); + return 0; + } + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); @@ -3602,6 +3609,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; + + if (!xfrm_default_allow(net, dir) && !xfrm_nr) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); + goto reject; + } + if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; @@ -4084,6 +4097,7 @@ static int __net_init xfrm_net_init(struct net *net) /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); + seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); rv = xfrm_statistics_init(net); @@ -4128,7 +4142,6 @@ void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); - seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index c6a4338a0d08..9277d81b344c 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -34,8 +34,11 @@ u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq) return seq_hi; } EXPORT_SYMBOL(xfrm_replay_seqhi); -; -static void xfrm_replay_notify(struct xfrm_state *x, int event) + +static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event); +static void xfrm_replay_notify_esn(struct xfrm_state *x, int event); + +void xfrm_replay_notify(struct xfrm_state *x, int event) { struct km_event c; /* we send notify messages in case @@ -48,6 +51,17 @@ static void xfrm_replay_notify(struct xfrm_state *x, int event) * The state structure must be locked! */ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + xfrm_replay_notify_bmp(x, event); + return; + case XFRM_REPLAY_MODE_ESN: + xfrm_replay_notify_esn(x, event); + return; + } + switch (event) { case XFRM_REPLAY_UPDATE: if (!x->replay_maxdiff || @@ -81,7 +95,7 @@ static void xfrm_replay_notify(struct xfrm_state *x, int event) x->xflags &= ~XFRM_TIME_DEFER; } -static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +static int __xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; struct net *net = xs_net(x); @@ -98,14 +112,14 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) return err; } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; } -static int xfrm_replay_check(struct xfrm_state *x, - struct sk_buff *skb, __be32 net_seq) +static int xfrm_replay_check_legacy(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) { u32 diff; u32 seq = ntohl(net_seq); @@ -136,14 +150,26 @@ err: return -EINVAL; } -static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) +static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq); +static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq); + +void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) { - u32 diff; - u32 seq = ntohl(net_seq); + u32 diff, seq; + + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_advance_bmp(x, net_seq); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_advance_esn(x, net_seq); + } if (!x->props.replay_window) return; + seq = ntohl(net_seq); if (seq > x->replay.seq) { diff = seq - x->replay.seq; if (diff < x->props.replay_window) @@ -157,7 +183,7 @@ static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) } if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) @@ -178,7 +204,7 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) return err; } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -273,7 +299,7 @@ static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq) replay_esn->bmp[nr] |= (1U << bitnr); if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event) @@ -416,7 +442,7 @@ static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb) } } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -481,6 +507,21 @@ err: return -EINVAL; } +int xfrm_replay_check(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_check_bmp(x, skb, net_seq); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_check_esn(x, skb, net_seq); + } + + return xfrm_replay_check_legacy(x, skb, net_seq); +} + static int xfrm_replay_recheck_esn(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { @@ -493,6 +534,22 @@ static int xfrm_replay_recheck_esn(struct xfrm_state *x, return xfrm_replay_check_esn(x, skb, net_seq); } +int xfrm_replay_recheck(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + /* no special recheck treatment */ + return xfrm_replay_check_bmp(x, skb, net_seq); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_recheck_esn(x, skb, net_seq); + } + + return xfrm_replay_check_legacy(x, skb, net_seq); +} + static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) { unsigned int bitnr, nr, i; @@ -548,7 +605,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) replay_esn->bmp[nr] |= (1U << bitnr); if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } #ifdef CONFIG_XFRM_OFFLOAD @@ -560,7 +617,7 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk __u32 oseq = x->replay.oseq; if (!xo) - return xfrm_replay_overflow(x, skb); + return __xfrm_replay_overflow(x, skb); if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { if (!skb_is_gso(skb)) { @@ -585,7 +642,7 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk x->replay.oseq = oseq; if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -625,7 +682,7 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff } if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; @@ -674,59 +731,39 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff replay_esn->oseq = oseq; if (xfrm_aevent_is_on(net)) - x->repl->notify(x, XFRM_REPLAY_UPDATE); + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } return err; } -static const struct xfrm_replay xfrm_replay_legacy = { - .advance = xfrm_replay_advance, - .check = xfrm_replay_check, - .recheck = xfrm_replay_check, - .notify = xfrm_replay_notify, - .overflow = xfrm_replay_overflow_offload, -}; - -static const struct xfrm_replay xfrm_replay_bmp = { - .advance = xfrm_replay_advance_bmp, - .check = xfrm_replay_check_bmp, - .recheck = xfrm_replay_check_bmp, - .notify = xfrm_replay_notify_bmp, - .overflow = xfrm_replay_overflow_offload_bmp, -}; - -static const struct xfrm_replay xfrm_replay_esn = { - .advance = xfrm_replay_advance_esn, - .check = xfrm_replay_check_esn, - .recheck = xfrm_replay_recheck_esn, - .notify = xfrm_replay_notify_esn, - .overflow = xfrm_replay_overflow_offload_esn, -}; +int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_overflow_offload_bmp(x, skb); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_overflow_offload_esn(x, skb); + } + + return xfrm_replay_overflow_offload(x, skb); +} #else -static const struct xfrm_replay xfrm_replay_legacy = { - .advance = xfrm_replay_advance, - .check = xfrm_replay_check, - .recheck = xfrm_replay_check, - .notify = xfrm_replay_notify, - .overflow = xfrm_replay_overflow, -}; - -static const struct xfrm_replay xfrm_replay_bmp = { - .advance = xfrm_replay_advance_bmp, - .check = xfrm_replay_check_bmp, - .recheck = xfrm_replay_check_bmp, - .notify = xfrm_replay_notify_bmp, - .overflow = xfrm_replay_overflow_bmp, -}; - -static const struct xfrm_replay xfrm_replay_esn = { - .advance = xfrm_replay_advance_esn, - .check = xfrm_replay_check_esn, - .recheck = xfrm_replay_recheck_esn, - .notify = xfrm_replay_notify_esn, - .overflow = xfrm_replay_overflow_esn, -}; +int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +{ + switch (x->repl_mode) { + case XFRM_REPLAY_MODE_LEGACY: + break; + case XFRM_REPLAY_MODE_BMP: + return xfrm_replay_overflow_bmp(x, skb); + case XFRM_REPLAY_MODE_ESN: + return xfrm_replay_overflow_esn(x, skb); + } + + return __xfrm_replay_overflow(x, skb); +} #endif int xfrm_init_replay(struct xfrm_state *x) @@ -741,12 +778,12 @@ int xfrm_init_replay(struct xfrm_state *x) if (x->props.flags & XFRM_STATE_ESN) { if (replay_esn->replay_window == 0) return -EINVAL; - x->repl = &xfrm_replay_esn; + x->repl_mode = XFRM_REPLAY_MODE_ESN; } else { - x->repl = &xfrm_replay_bmp; + x->repl_mode = XFRM_REPLAY_MODE_BMP; } } else { - x->repl = &xfrm_replay_legacy; + x->repl_mode = XFRM_REPLAY_MODE_LEGACY; } return 0; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 4496f7efa220..a2f4001221d1 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -78,10 +78,16 @@ xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } +static unsigned int xfrm_seq_hash(struct net *net, u32 seq) +{ + return __xfrm_seq_hash(seq, net->xfrm.state_hmask); +} + static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, struct hlist_head *nspitable, + struct hlist_head *nseqtable, unsigned int nhashmask) { struct hlist_node *tmp; @@ -106,6 +112,11 @@ static void xfrm_hash_transfer(struct hlist_head *list, nhashmask); hlist_add_head_rcu(&x->byspi, nspitable + h); } + + if (x->km.seq) { + h = __xfrm_seq_hash(x->km.seq, nhashmask); + hlist_add_head_rcu(&x->byseq, nseqtable + h); + } } } @@ -117,7 +128,7 @@ static unsigned long xfrm_hash_new_size(unsigned int state_hmask) static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); - struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi; + struct hlist_head *ndst, *nsrc, *nspi, *nseq, *odst, *osrc, *ospi, *oseq; unsigned long nsize, osize; unsigned int nhashmask, ohashmask; int i; @@ -137,6 +148,13 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_hash_free(nsrc, nsize); return; } + nseq = xfrm_hash_alloc(nsize); + if (!nseq) { + xfrm_hash_free(ndst, nsize); + xfrm_hash_free(nsrc, nsize); + xfrm_hash_free(nspi, nsize); + return; + } spin_lock_bh(&net->xfrm.xfrm_state_lock); write_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); @@ -144,15 +162,17 @@ static void xfrm_hash_resize(struct work_struct *work) nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) - xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask); + xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nseq, nhashmask); osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); + oseq = xfrm_state_deref_prot(net->xfrm.state_byseq, net); ohashmask = net->xfrm.state_hmask; rcu_assign_pointer(net->xfrm.state_bydst, ndst); rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); rcu_assign_pointer(net->xfrm.state_byspi, nspi); + rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_state_hash_generation); @@ -165,6 +185,7 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); + xfrm_hash_free(oseq, osize); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); @@ -621,6 +642,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); + INIT_HLIST_NODE(&x->byseq); hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); x->mtimer.function = xfrm_timer_handler; timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); @@ -664,6 +686,8 @@ int __xfrm_state_delete(struct xfrm_state *x) list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); + if (x->km.seq) + hlist_del_rcu(&x->byseq); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1148,6 +1172,10 @@ found: h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } + if (x->km.seq) { + h = xfrm_seq_hash(net, x->km.seq); + hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), @@ -1263,6 +1291,12 @@ static void __xfrm_state_insert(struct xfrm_state *x) hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); } + if (x->km.seq) { + h = xfrm_seq_hash(net, x->km.seq); + + hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + } + hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (x->replay_maxage) mod_timer(&x->rtimer, jiffies + x->replay_maxage); @@ -1932,20 +1966,18 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) { - int i; - - for (i = 0; i <= net->xfrm.state_hmask; i++) { - struct xfrm_state *x; + unsigned int h = xfrm_seq_hash(net, seq); + struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { - if (x->km.seq == seq && - (mark & x->mark.m) == x->mark.v && - x->km.state == XFRM_STATE_ACQ) { - xfrm_state_hold(x); - return x; - } + hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { + if (x->km.seq == seq && + (mark & x->mark.m) == x->mark.v && + x->km.state == XFRM_STATE_ACQ) { + xfrm_state_hold(x); + return x; } } + return NULL; } @@ -2145,7 +2177,7 @@ static void xfrm_replay_timer_handler(struct timer_list *t) if (x->km.state == XFRM_STATE_VALID) { if (xfrm_aevent_is_on(xs_net(x))) - x->repl->notify(x, XFRM_REPLAY_TIMEOUT); + xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT); else x->xflags |= XFRM_TIME_DEFER; } @@ -2518,7 +2550,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2549,7 +2581,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(xfrm_state_mtu); +EXPORT_SYMBOL_GPL(__xfrm_state_mtu); + +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +{ + mtu = __xfrm_state_mtu(x, mtu); + + if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) + return IPV6_MIN_MTU; + + return mtu; +} int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { @@ -2660,6 +2702,9 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_byspi = xfrm_hash_alloc(sz); if (!net->xfrm.state_byspi) goto out_byspi; + net->xfrm.state_byseq = xfrm_hash_alloc(sz); + if (!net->xfrm.state_byseq) + goto out_byseq; net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; @@ -2669,6 +2714,8 @@ int __net_init xfrm_state_init(struct net *net) &net->xfrm.xfrm_state_lock); return 0; +out_byseq: + xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: xfrm_hash_free(net->xfrm.state_bysrc, sz); out_bysrc: @@ -2688,6 +2735,8 @@ void xfrm_state_fini(struct net *net) WARN_ON(!list_empty(&net->xfrm.state_all)); sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); + WARN_ON(!hlist_empty(net->xfrm.state_byseq)); + xfrm_hash_free(net->xfrm.state_byseq, sz); WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f0aecee4d539..03b66d154b2b 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -580,6 +580,20 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, copy_from_user_state(x, p); + if (attrs[XFRMA_ENCAP]) { + x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), + sizeof(*x->encap), GFP_KERNEL); + if (x->encap == NULL) + goto error; + } + + if (attrs[XFRMA_COADDR]) { + x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), + sizeof(*x->coaddr), GFP_KERNEL); + if (x->coaddr == NULL) + goto error; + } + if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); @@ -600,23 +614,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, attrs[XFRMA_ALG_COMP]))) goto error; - if (attrs[XFRMA_ENCAP]) { - x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), - sizeof(*x->encap), GFP_KERNEL); - if (x->encap == NULL) - goto error; - } - if (attrs[XFRMA_TFCPAD]) x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]); - if (attrs[XFRMA_COADDR]) { - x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), - sizeof(*x->coaddr), GFP_KERNEL); - if (x->coaddr == NULL) - goto error; - } - xfrm_mark_get(attrs, &x->mark); xfrm_smark_init(attrs, &x->props.smark); @@ -1961,6 +1961,59 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, return skb; } +static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_default *up = nlmsg_data(nlh); + u8 dirmask; + u8 old_default = net->xfrm.policy_default; + + if (up->dirmask >= XFRM_USERPOLICY_DIRMASK_MAX) + return -EINVAL; + + dirmask = (1 << up->dirmask) & XFRM_POL_DEFAULT_MASK; + + net->xfrm.policy_default = (old_default & (0xff ^ dirmask)) + | (up->action << up->dirmask); + + rt_genid_bump_all(net); + + return 0; +} + +static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct sk_buff *r_skb; + struct nlmsghdr *r_nlh; + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_default *r_up, *up; + int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); + u32 portid = NETLINK_CB(skb).portid; + u32 seq = nlh->nlmsg_seq; + + up = nlmsg_data(nlh); + + r_skb = nlmsg_new(len, GFP_ATOMIC); + if (!r_skb) + return -ENOMEM; + + r_nlh = nlmsg_put(r_skb, portid, seq, XFRM_MSG_GETDEFAULT, sizeof(*r_up), 0); + if (!r_nlh) { + kfree_skb(r_skb); + return -EMSGSIZE; + } + + r_up = nlmsg_data(r_nlh); + + r_up->action = ((net->xfrm.policy_default & (1 << up->dirmask)) >> up->dirmask); + r_up->dirmask = up->dirmask; + nlmsg_end(r_skb, r_nlh); + + return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); +} + static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2664,6 +2717,8 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), + [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; EXPORT_SYMBOL_GPL(xfrm_msg_min); @@ -2743,6 +2798,8 @@ static const struct xfrm_link { .nla_pol = xfrma_spd_policy, .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, + [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_set_default }, + [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_get_default }, }; static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2811,6 +2868,16 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, err = link->doit(skb, nlh, attrs); + /* We need to free skb allocated in xfrm_alloc_compat() before + * returning from this function, because consume_skb() won't take + * care of frag_list since netlink destructor sets + * sbk->head to NULL. (see netlink_skb_destructor()) + */ + if (skb_has_frag_list(skb)) { + kfree_skb(skb_shinfo(skb)->frag_list); + skb_shinfo(skb)->frag_list = NULL; + } + err: kvfree(nlh64); return err; |
