diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-12 18:57:02 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-12 18:57:02 -0800 |
| commit | aee3bfa3307cd0da2126bdc0ea359dabea5ee8f7 (patch) | |
| tree | 3d35c69e8fa835098bb90f77f30abed120681651 /net/core | |
| parent | c597b6bcd5c624534afc3df65cdc42bb05173bca (diff) | |
| parent | 415b6f19e87e350b13585591859d4fdf50772229 (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from Davic Miller:
1) Support busy polling generically, for all NAPI drivers. From Eric
Dumazet.
2) Add byte/packet counter support to nft_ct, from Floriani Westphal.
3) Add RSS/XPS support to mvneta driver, from Gregory Clement.
4) Implement IPV6_HDRINCL socket option for raw sockets, from Hannes
Frederic Sowa.
5) Add support for T6 adapter to cxgb4 driver, from Hariprasad Shenai.
6) Add support for VLAN device bridging to mlxsw switch driver, from
Ido Schimmel.
7) Add driver for Netronome NFP4000/NFP6000, from Jakub Kicinski.
8) Provide hwmon interface to mlxsw switch driver, from Jiri Pirko.
9) Reorganize wireless drivers into per-vendor directories just like we
do for ethernet drivers. From Kalle Valo.
10) Provide a way for administrators "destroy" connected sockets via the
SOCK_DESTROY socket netlink diag operation. From Lorenzo Colitti.
11) Add support to add/remove multicast routes via netlink, from Nikolay
Aleksandrov.
12) Make TCP keepalive settings per-namespace, from Nikolay Borisov.
13) Add forwarding and packet duplication facilities to nf_tables, from
Pablo Neira Ayuso.
14) Dead route support in MPLS, from Roopa Prabhu.
15) TSO support for thunderx chips, from Sunil Goutham.
16) Add driver for IBM's System i/p VNIC protocol, from Thomas Falcon.
17) Rationalize, consolidate, and more completely document the checksum
offloading facilities in the networking stack. From Tom Herbert.
18) Support aborting an ongoing scan in mac80211/cfg80211, from
Vidyullatha Kanchanapally.
19) Use per-bucket spinlock for bpf hash facility, from Tom Leiming.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1375 commits)
net: bnxt: always return values from _bnxt_get_max_rings
net: bpf: reject invalid shifts
phonet: properly unshare skbs in phonet_rcv()
dwc_eth_qos: Fix dma address for multi-fragment skbs
phy: remove an unneeded condition
mdio: remove an unneed condition
mdio_bus: NULL dereference on allocation error
net: Fix typo in netdev_intersect_features
net: freescale: mac-fec: Fix build error from phy_device API change
net: freescale: ucc_geth: Fix build error from phy_device API change
bonding: Prevent IPv6 link local address on enslaved devices
IB/mlx5: Add flow steering support
net/mlx5_core: Export flow steering API
net/mlx5_core: Make ipv4/ipv6 location more clear
net/mlx5_core: Enable flow steering support for the IB driver
net/mlx5_core: Initialize namespaces only when supported by device
net/mlx5_core: Set priority attributes
net/mlx5_core: Connect flow tables
net/mlx5_core: Introduce modify flow table command
net/mlx5_core: Managing root flow table
...
Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/Makefile | 2 | ||||
| -rw-r--r-- | net/core/datagram.c | 77 | ||||
| -rw-r--r-- | net/core/dev.c | 414 | ||||
| -rw-r--r-- | net/core/ethtool.c | 85 | ||||
| -rw-r--r-- | net/core/filter.c | 299 | ||||
| -rw-r--r-- | net/core/net-sysfs.c | 5 | ||||
| -rw-r--r-- | net/core/net-traces.c | 4 | ||||
| -rw-r--r-- | net/core/netclassid_cgroup.c | 11 | ||||
| -rw-r--r-- | net/core/netprio_cgroup.c | 19 | ||||
| -rw-r--r-- | net/core/pktgen.c | 8 | ||||
| -rw-r--r-- | net/core/rtnetlink.c | 21 | ||||
| -rw-r--r-- | net/core/scm.c | 4 | ||||
| -rw-r--r-- | net/core/sock.c | 54 | ||||
| -rw-r--r-- | net/core/sock_diag.c | 23 | ||||
| -rw-r--r-- | net/core/sock_reuseport.c | 251 | ||||
| -rw-r--r-- | net/core/stream.c | 2 |
16 files changed, 1093 insertions, 186 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 086b01fbe1bd..0b835de04de3 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o tso.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/datagram.c b/net/core/datagram.c index d62af69ad844..fa9dc6450b08 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -83,8 +83,8 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn /* * Wait for the last received packet to be different from skb */ -static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, - const struct sk_buff *skb) +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -130,6 +130,7 @@ out_noerr: error = 1; goto out; } +EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { @@ -161,13 +162,15 @@ done: } /** - * __skb_recv_datagram - Receive a datagram skbuff + * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned + * @last: set to last peeked message to inform the wait function + * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and @@ -175,9 +178,11 @@ done: * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * - * This function will lock the socket if a skb is returned, so the caller - * needs to unlock the socket in that case (usually by calling - * skb_free_datagram) + * This function will lock the socket if a skb is returned, so + * the caller needs to unlock the socket in that case (usually by + * calling skb_free_datagram). Returns NULL with *err set to + * -EAGAIN if no data was available or to some other value if an + * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve @@ -191,13 +196,13 @@ done: * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, - int *peeked, int *off, int *err) +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err, + struct sk_buff **last) { struct sk_buff_head *queue = &sk->sk_receive_queue; - struct sk_buff *skb, *last; + struct sk_buff *skb; unsigned long cpu_flags; - long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ @@ -206,8 +211,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -217,10 +220,10 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, */ int _off = *off; - last = (struct sk_buff *)queue; + *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { - last = skb; + *last = skb; *peeked = skb->peeked; if (flags & MSG_PEEK) { if (_off >= skb->len && (skb->len || _off || @@ -231,8 +234,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, skb = skb_set_peeked(skb); error = PTR_ERR(skb); - if (IS_ERR(skb)) - goto unlock_err; + if (IS_ERR(skb)) { + spin_unlock_irqrestore(&queue->lock, + cpu_flags); + goto no_packet; + } atomic_inc(&skb->users); } else @@ -242,25 +248,38 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, *off = _off; return skb; } + spin_unlock_irqrestore(&queue->lock, cpu_flags); + } while (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)); - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)) - continue; + error = -EAGAIN; - /* User doesn't want to wait */ - error = -EAGAIN; - if (!timeo) - goto no_packet; +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_try_recv_datagram); - } while (!wait_for_more_packets(sk, err, &timeo, last)); +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err) +{ + struct sk_buff *skb, *last; + long timeo; - return NULL; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, + &last); + if (skb) + return skb; + + if (*err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, err, &timeo, last)); -unlock_err: - spin_unlock_irqrestore(&queue->lock, cpu_flags); -no_packet: - *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index ae00b894e675..0ca95d5d7af0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -96,6 +96,7 @@ #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/sock.h> +#include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dst.h> @@ -137,6 +138,7 @@ #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> +#include <linux/sctp.h> #include "net-sysfs.h" @@ -182,8 +184,8 @@ EXPORT_SYMBOL(dev_base_lock); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; -static DEFINE_HASHTABLE(napi_hash, 8); +static unsigned int napi_gen_id = NR_CPUS; +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; @@ -1674,6 +1676,22 @@ void net_dec_ingress_queue(void) EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif +#ifdef CONFIG_NET_EGRESS +static struct static_key egress_needed __read_mostly; + +void net_inc_egress_queue(void) +{ + static_key_slow_inc(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_key_slow_dec(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL /* We are not allowed to call static_key_slow_dec() from irq context @@ -2470,6 +2488,141 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +/* skb_csum_offload_check - Driver helper function to determine if a device + * with limited checksum offload capabilities is able to offload the checksum + * for a given packet. + * + * Arguments: + * skb - sk_buff for the packet in question + * spec - contains the description of what device can offload + * csum_encapped - returns true if the checksum being offloaded is + * encpasulated. That is it is checksum for the transport header + * in the inner headers. + * checksum_help - when set indicates that helper function should + * call skb_checksum_help if offload checks fail + * + * Returns: + * true: Packet has passed the checksum checks and should be offloadable to + * the device (a driver may still need to check for additional + * restrictions of its device) + * false: Checksum is not offloadable. If checksum_help was set then + * skb_checksum_help was called to resolve checksum for non-GSO + * packets and when IP protocol is not SCTP + */ +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + struct iphdr *iph; + struct ipv6hdr *ipv6; + void *nhdr; + int protocol; + u8 ip_proto; + + if (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD)) { + if (!spec->vlan_okay) + goto need_help; + } + + /* We check whether the checksum refers to a transport layer checksum in + * the outermost header or an encapsulated transport layer checksum that + * corresponds to the inner headers of the skb. If the checksum is for + * something else in the packet we need help. + */ + if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { + /* Non-encapsulated checksum */ + protocol = eproto_to_ipproto(vlan_get_protocol(skb)); + nhdr = skb_network_header(skb); + *csum_encapped = false; + if (spec->no_not_encapped) + goto need_help; + } else if (skb->encapsulation && spec->encap_okay && + skb_checksum_start_offset(skb) == + skb_inner_transport_offset(skb)) { + /* Encapsulated checksum */ + *csum_encapped = true; + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = eproto_to_ipproto(skb->inner_protocol); + break; + case ENCAP_TYPE_IPPROTO: + protocol = skb->inner_protocol; + break; + } + nhdr = skb_inner_network_header(skb); + } else { + goto need_help; + } + + switch (protocol) { + case IPPROTO_IP: + if (!spec->ipv4_okay) + goto need_help; + iph = nhdr; + ip_proto = iph->protocol; + if (iph->ihl != 5 && !spec->ip_options_okay) + goto need_help; + break; + case IPPROTO_IPV6: + if (!spec->ipv6_okay) + goto need_help; + if (spec->no_encapped_ipv6 && *csum_encapped) + goto need_help; + ipv6 = nhdr; + nhdr += sizeof(*ipv6); + ip_proto = ipv6->nexthdr; + break; + default: + goto need_help; + } + +ip_proto_again: + switch (ip_proto) { + case IPPROTO_TCP: + if (!spec->tcp_okay || + skb->csum_offset != offsetof(struct tcphdr, check)) + goto need_help; + break; + case IPPROTO_UDP: + if (!spec->udp_okay || + skb->csum_offset != offsetof(struct udphdr, check)) + goto need_help; + break; + case IPPROTO_SCTP: + if (!spec->sctp_okay || + skb->csum_offset != offsetof(struct sctphdr, checksum)) + goto cant_help; + break; + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 *opthdr = nhdr; + + if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) + goto need_help; + + ip_proto = opthdr[0]; + nhdr += (opthdr[1] + 1) << 3; + + goto ip_proto_again; + } + default: + goto need_help; + } + + /* Passed the tests for offloading checksum */ + return true; + +need_help: + if (csum_help && !skb_shinfo(skb)->gso_size) + skb_checksum_help(skb); +cant_help: + return false; +} +EXPORT_SYMBOL(__skb_csum_offload_chk); + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -2644,7 +2797,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2791,7 +2944,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && + if (!(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) goto out_kfree_skb; } @@ -2870,7 +3023,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -2928,7 +3080,8 @@ static void skb_update_prio(struct sk_buff *skb) struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + unsigned int prioidx = + sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; @@ -2962,6 +3115,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +#ifdef CONFIG_NET_EGRESS +static struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct tcf_result cl_res; + + if (!cl) + return skb; + + /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set + * earlier by the caller. + */ + qdisc_bstats_cpu_update(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_cpu_drop(cl->q); + *ret = NET_XMIT_DROP; + goto drop; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *ret = NET_XMIT_SUCCESS; +drop: + kfree_skb(skb); + return NULL; + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + return NULL; + default: + break; + } + + return skb; +} +#endif /* CONFIG_NET_EGRESS */ + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -3021,7 +3217,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, int queue_index = 0; #ifdef CONFIG_XPS - if (skb->sender_cpu == 0) + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif @@ -3086,6 +3284,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + qdisc_pkt_len_init(skb); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); +# ifdef CONFIG_NET_EGRESS + if (static_key_false(&egress_needed)) { + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + } +# endif +#endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ @@ -3107,9 +3316,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); -#endif trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); @@ -3666,9 +3872,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) +static inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); @@ -3862,7 +4068,7 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; @@ -4353,6 +4559,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); @@ -4386,7 +4593,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - napi->skb = skb; + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } } return skb; } @@ -4661,7 +4871,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -4672,43 +4882,101 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } -EXPORT_SYMBOL_GPL(napi_by_id); -void napi_hash_add(struct napi_struct *napi) +#if defined(CONFIG_NET_RX_BUSY_POLL) +#define BUSY_POLL_BUDGET 8 +bool sk_busy_loop(struct sock *sk, int nonblock) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + int (*busy_poll)(struct napi_struct *dev); + struct napi_struct *napi; + int rc = false; - spin_lock(&napi_hash_lock); + rcu_read_lock(); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + /* Note: ndo_busy_poll method is optional in linux-4.5 */ + busy_poll = napi->dev->netdev_ops->ndo_busy_poll; + + do { + rc = 0; + local_bh_disable(); + if (busy_poll) { + rc = busy_poll(napi); + } else if (napi_schedule_prep(napi)) { + void *have = netpoll_poll_lock(napi); + + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi); + if (rc == BUSY_POLL_BUDGET) { + napi_complete_done(napi, rc); + napi_schedule(napi); + } + } + netpoll_poll_unlock(have); } + if (rc > 0) + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + local_bh_enable(); - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ - spin_unlock(&napi_hash_lock); - } + cpu_relax(); + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + !need_resched() && !busy_loop_timeout(end_time)); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock(); + return rc; +} +EXPORT_SYMBOL(sk_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +void napi_hash_add(struct napi_struct *napi) +{ + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || + test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + return; + + spin_lock(&napi_hash_lock); + + /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < NR_CPUS + 1)) + napi_gen_id = NR_CPUS + 1; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); } EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -void napi_hash_del(struct napi_struct *napi) +bool napi_hash_del(struct napi_struct *napi) { + bool rcu_sync_needed = false; + spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { + rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); - + } spin_unlock(&napi_hash_lock); + return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); @@ -4744,6 +5012,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); @@ -4763,8 +5032,12 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +/* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { + might_sleep(); + if (napi_hash_del(napi)) + synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); @@ -5351,7 +5624,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *private) + void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5375,6 +5648,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, changeupper_info.upper_dev = upper_dev; changeupper_info.master = master; changeupper_info.linking = true; + changeupper_info.upper_info = upper_info; ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, &changeupper_info.info); @@ -5382,7 +5656,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; @@ -5420,8 +5694,12 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, - &changeupper_info.info); + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback_lower_mesh; + return 0; rollback_lower_mesh: @@ -5475,7 +5753,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -5483,6 +5761,8 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5491,20 +5771,14 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv, void *upper_info) { - return __netdev_upper_dev_link(dev, upper_dev, true, NULL); + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info); } EXPORT_SYMBOL(netdev_master_upper_dev_link); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private) -{ - return __netdev_upper_dev_link(dev, upper_dev, true, private); -} -EXPORT_SYMBOL(netdev_master_upper_dev_link_private); - /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -5663,7 +5937,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)) + bool (*type_check)(const struct net_device *dev)) { struct net_device *lower = NULL; struct list_head *iter; @@ -5685,6 +5959,26 @@ int dev_get_nest_level(struct net_device *dev, } EXPORT_SYMBOL(dev_get_nest_level); +/** + * netdev_lower_change - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6375,9 +6669,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { /* maybe split UFO into V4 and V6? */ - if (!((features & NETIF_F_GEN_CSUM) || - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + if (!(features & NETIF_F_HW_CSUM) && + ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { netdev_dbg(dev, "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; @@ -7164,11 +7458,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs); * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. * If this is the last reference then it will be freed. + * Must be called in process context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + might_sleep(); netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); @@ -7477,16 +7773,16 @@ static int dev_cpu_callback(struct notifier_block *nfb, netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { - if (mask & NETIF_F_GEN_CSUM) - mask |= NETIF_F_ALL_CSUM; + if (mask & NETIF_F_HW_CSUM) + mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; - all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ - if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 29edf74846fc..daf04709dd3c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -87,7 +87,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", [NETIF_F_RXHASH_BIT] = "rx-hashing", @@ -191,6 +191,23 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) return ret; } +static int phy_get_sset_count(struct phy_device *phydev) +{ + int ret; + + if (phydev->drv->get_sset_count && + phydev->drv->get_strings && + phydev->drv->get_stats) { + mutex_lock(&phydev->lock); + ret = phydev->drv->get_sset_count(phydev); + mutex_unlock(&phydev->lock); + + return ret; + } + + return -EOPNOTSUPP; +} + static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -204,6 +221,13 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); + if (sset == ETH_SS_PHY_STATS) { + if (dev->phydev) + return phy_get_sset_count(dev->phydev); + else + return -EOPNOTSUPP; + } + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -223,7 +247,17 @@ static void __ethtool_get_strings(struct net_device *dev, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); - else + else if (stringset == ETH_SS_PHY_STATS) { + struct phy_device *phydev = dev->phydev; + + if (phydev) { + mutex_lock(&phydev->lock); + phydev->drv->get_strings(phydev, data); + mutex_unlock(&phydev->lock); + } else { + return; + } + } else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } @@ -235,7 +269,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: - return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; @@ -1401,6 +1435,47 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + struct phy_device *phydev = dev->phydev; + u64 *data; + int ret, n_stats; + + if (!phydev) + return -EOPNOTSUPP; + + n_stats = phy_get_sset_count(phydev); + + if (n_stats < 0) + return n_stats; + WARN_ON(n_stats == 0); + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + mutex_lock(&phydev->lock); + phydev->drv->get_stats(phydev, &stats, data); + mutex_unlock(&phydev->lock); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; @@ -1779,6 +1854,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: + case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: @@ -1991,6 +2067,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; + case ETHTOOL_GPHYSTATS: + rc = ethtool_get_phy_stats(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index 672eefbfbe99..94d26201080d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -50,6 +50,7 @@ #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> +#include <net/sock_reuseport.h> /** * sk_filter - run a packet through a socket filter @@ -348,12 +349,6 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * jump offsets, 2nd pass remapping: * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); - * - * User BPF's register A is mapped to our BPF register 6, user BPF - * register X is mapped to BPF register 7; frame pointer is always - * register 10; Context 'void *ctx' is stored in register 1, that is, - * for socket filters: ctx == 'struct sk_buff *', for seccomp: - * ctx == 'struct seccomp_data *'. */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len) @@ -381,9 +376,22 @@ do_pass: new_insn = new_prog; fp = prog; - if (new_insn) - *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); - new_insn++; + /* Classic BPF related prologue emission. */ + if (new_insn) { + /* Classic BPF expects A and X to be reset first. These need + * to be guaranteed to be the first two instructions. + */ + *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + + /* All programs must keep CTX in callee saved BPF_REG_CTX. + * In eBPF case it's done by the compiler, here we need to + * do this ourself. Initial CTX is present in BPF_REG_ARG1. + */ + *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + } else { + new_insn += 3; + } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[6] = { }; @@ -777,6 +785,11 @@ static int bpf_check_classic(const struct sock_filter *filter, if (ftest->k == 0) return -EINVAL; break; + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + if (ftest->k >= 32) + return -EINVAL; + break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: @@ -1160,17 +1173,32 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -/** - * sk_attach_filter - attach a socket filter - * @fprog: the filter program - * @sk: the socket to use - * - * Attach the user's filter code. We first run some sanity checks on - * it to make sure it does not explode on us later. If an error - * occurs or there is insufficient memory for the filter a negative - * errno code is returned. On success the return is zero. - */ -int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct bpf_prog *old_prog; + int err; + + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + return -ENOMEM; + + if (sk_unhashed(sk)) { + err = reuseport_alloc(sk); + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + + old_prog = reuseport_attach_prog(sk, prog); + if (old_prog) + bpf_prog_destroy(old_prog); + + return 0; +} + +static +struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); unsigned int bpf_fsize = bpf_prog_size(fprog->len); @@ -1178,19 +1206,19 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; + return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) - return -EINVAL; + return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); - return -EFAULT; + return ERR_PTR(-EFAULT); } prog->len = fprog->len; @@ -1198,13 +1226,30 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog, NULL); + return bpf_prepare_filter(prog, NULL); +} + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + if (IS_ERR(prog)) return PTR_ERR(prog); @@ -1218,23 +1263,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); -int sk_attach_bpf(u32 ufd, struct sock *sk) +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { - struct bpf_prog *prog; + struct bpf_prog *prog = __get_filter(fprog, sk); int err; + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + __bpf_prog_release(prog); + return err; + } + + return 0; +} + +static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; + return ERR_PTR(-EPERM); prog = bpf_prog_get(ufd); if (IS_ERR(prog)) - return PTR_ERR(prog); + return prog; if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); - return -EINVAL; + return ERR_PTR(-EINVAL); } + return prog; +} + +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); @@ -1244,7 +1316,24 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + bpf_prog_put(prog); + return err; + } + + return 0; +} + +#define BPF_LDST_LEN 16U static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { @@ -1252,9 +1341,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[16]; + char buf[BPF_LDST_LEN]; void *ptr; + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + return -EINVAL; + /* bpf verifier guarantees that: * 'from' pointer points to bpf program stack * 'len' bytes of it were initialized @@ -1274,7 +1366,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely(!ptr)) return -EFAULT; - if (BPF_RECOMPUTE_CSUM(flags)) + if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpull_rcsum(skb, ptr, len); memcpy(ptr, from, len); @@ -1283,8 +1375,9 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); - if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + if (flags & BPF_F_RECOMPUTE_CSUM) + skb_postpush_rcsum(skb, ptr, len); + return 0; } @@ -1299,8 +1392,35 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) -#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) +static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; + int offset = (int) r2; + void *to = (void *)(unsigned long) r3; + unsigned int len = (unsigned int) r4; + void *ptr; + + if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, to); + if (unlikely(!ptr)) + return -EFAULT; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +} + +const struct bpf_func_proto bpf_skb_load_bytes_proto = { + .func = bpf_skb_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { @@ -1308,6 +1428,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) int offset = (int) r2; __sum16 sum, *ptr; + if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) + return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1319,7 +1441,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely(!ptr)) return -EFAULT; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { case 2: csum_replace2(ptr, from, to); break; @@ -1351,10 +1473,12 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); + bool is_pseudo = flags & BPF_F_PSEUDO_HDR; int offset = (int) r2; __sum16 sum, *ptr; + if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1366,7 +1490,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely(!ptr)) return -EFAULT; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1395,13 +1519,14 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) - static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; struct net_device *dev; + if (unlikely(flags & ~(BPF_F_INGRESS))) + return -EINVAL; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -1410,8 +1535,12 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - if (BPF_IS_REDIRECT_INGRESS(flags)) + if (flags & BPF_F_INGRESS) { + if (skb_at_tc_ingress(skb2)) + skb_postpush_rcsum(skb2, skb_mac_header(skb2), + skb2->mac_len); return dev_forward_skb(dev, skb2); + } skb2->dev = dev; skb_sender_cpu_clear(skb2); @@ -1433,12 +1562,17 @@ struct redirect_info { }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); + static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + if (unlikely(flags & ~(BPF_F_INGRESS))) + return TC_ACT_SHOT; + ri->ifindex = ifindex; ri->flags = flags; + return TC_ACT_REDIRECT; } @@ -1454,8 +1588,12 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - if (BPF_IS_REDIRECT_INGRESS(ri->flags)) + if (ri->flags & BPF_F_INGRESS) { + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), + skb->mac_len); return dev_forward_skb(dev, skb); + } skb->dev = dev; skb_sender_cpu_clear(skb); @@ -1547,19 +1685,49 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned short bpf_tunnel_key_af(u64 flags) +{ + return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; +} + static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + u8 compat[sizeof(struct bpf_tunnel_key)]; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) - return -EINVAL; - if (ip_tunnel_info_af(info) != AF_INET) + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) return -EINVAL; + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) + return -EPROTO; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + to = (struct bpf_tunnel_key *)compat; + break; + default: + return -EINVAL; + } + } to->tunnel_id = be64_to_cpu(info->key.tun_id); - to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + to->tunnel_tos = info->key.tos; + to->tunnel_ttl = info->key.ttl; + + if (flags & BPF_F_TUNINFO_IPV6) + memcpy(to->remote_ipv6, &info->key.u.ipv6.src, + sizeof(to->remote_ipv6)); + else + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + + if (unlikely(size != sizeof(struct bpf_tunnel_key))) + memcpy((void *)(long) r2, to, size); return 0; } @@ -1581,10 +1749,25 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); + u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) return -EINVAL; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + memcpy(compat, from, size); + memset(compat + size, 0, sizeof(compat) - size); + from = (struct bpf_tunnel_key *)compat; + break; + default: + return -EINVAL; + } + } skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1592,9 +1775,19 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_flags = TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); - info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.tos = from->tunnel_tos; + info->key.ttl = from->tunnel_ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + info->mode |= IP_TUNNEL_INFO_IPV6; + memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, + sizeof(from->remote_ipv6)); + } else { + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + } return 0; } @@ -1654,6 +1847,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f88a62ab019d..b6c8a6629b39 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -471,6 +471,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct switchdev_attr attr = { + .orig_dev = netdev, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -1452,8 +1453,8 @@ static void netdev_release(struct device *d) static const void *net_namespace(struct device *d) { - struct net_device *dev; - dev = container_of(d, struct net_device, dev); + struct net_device *dev = to_net_dev(d); + return dev_net(dev); } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index adef015b2f41..92da5e4ceb4f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -32,6 +32,10 @@ #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/fib.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <trace/events/fib6.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index d9ee8d08a3a6..0260c84ed83c 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -61,9 +61,12 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - + if (sock) { + spin_lock(&cgroup_sk_update_lock); + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, + (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } @@ -100,6 +103,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, { struct cgroup_cls_state *cs = css_cls_state(css); + cgroup_sk_alloc_disable(); + cs->classid = (u32)value; update_classid(css, (void *)(unsigned long)cs->classid); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 40fd09fe06ae..f1efbc39ef6b 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,6 +27,12 @@ #include <linux/fdtable.h> +/* + * netprio allocates per-net_device priomap array which is indexed by + * css->id. Limiting css ID to 16bits doesn't lose anything. + */ +#define NETPRIO_ID_MAX USHRT_MAX + #define PRIOMAP_MIN_SZ 128 /* @@ -144,6 +150,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css) struct net_device *dev; int ret = 0; + if (css->id > NETPRIO_ID_MAX) + return -ENOSPC; + if (!parent_css) return 0; @@ -200,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; + cgroup_sk_alloc_disable(); + rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -213,8 +224,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + if (sock) { + spin_lock(&cgroup_sk_update_lock); + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index de8d5cc5eb24..1474cfd2dc1c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2787,7 +2787,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, } else { skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + if (likely(skb)) + skb_reserve(skb, LL_RESERVED_SPACE(dev)); return skb; } @@ -2898,7 +2900,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V4_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; udp4_hwcsum(skb, iph->saddr, iph->daddr); @@ -3032,7 +3034,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V6_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34ba7a08876d..d735e854f916 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1027,6 +1027,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct switchdev_attr attr = { + .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -2563,7 +2564,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, - int nlflags) + int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; @@ -2579,7 +2580,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; @@ -2600,7 +2601,8 @@ static inline size_t rtnl_fdb_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); } -static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type) +static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, + u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2611,7 +2613,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, - 0, 0, type, NTF_SELF, 0); + 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; @@ -2746,7 +2748,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) nlh->nlmsg_flags); if (!err) { - rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH); + rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2847,7 +2850,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { - rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH); + rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2875,7 +2879,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, - NLM_F_MULTI); + NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: @@ -3347,7 +3351,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); rtnl_doit_func doit; - int sz_idx, kind; + int kind; int family; int type; int err; @@ -3363,7 +3367,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; - sz_idx = type>>2; kind = type&3; if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) diff --git a/net/core/scm.c b/net/core/scm.c index 8a1741b14302..14596fb37172 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -289,8 +289,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/sock.c b/net/core/sock.c index 0d91f7dca751..51270238e269 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -134,6 +134,7 @@ #include <linux/sock_diag.h> #include <linux/filter.h> +#include <net/sock_reuseport.h> #include <trace/events/sock.h> @@ -932,6 +933,32 @@ set_rcvbuf: } break; + case SO_ATTACH_REUSEPORT_CBPF: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_reuseport_attach_filter(&fprog, sk); + } + break; + + case SO_ATTACH_REUSEPORT_EBPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_reuseport_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -1362,6 +1389,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1384,6 +1412,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) owner = prot->owner; slab = prot->slab; + cgroup_sk_free(&sk->sk_cgrp_data); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); @@ -1392,17 +1421,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1431,8 +1449,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); } return sk; @@ -1452,6 +1470,8 @@ void sk_destruct(struct sock *sk) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -2281,7 +2301,7 @@ static void sock_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } @@ -2292,7 +2312,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); @@ -2304,7 +2324,7 @@ static void sock_def_readable(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -2322,7 +2342,7 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 0c1d58d43f67..a996ce8c8fb2 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -214,7 +214,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; struct sock_diag_req *req = nlmsg_data(nlh); @@ -234,8 +234,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; - else + else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); + else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) + err = hndl->destroy(skb, nlh); + else + err = -EOPNOTSUPP; mutex_unlock(&sock_diag_table_mutex); return err; @@ -261,7 +265,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return ret; case SOCK_DIAG_BY_FAMILY: - return __sock_diag_rcv_msg(skb, nlh); + case SOCK_DESTROY: + return __sock_diag_cmd(skb, nlh); default: return -EINVAL; } @@ -295,6 +300,18 @@ static int sock_diag_bind(struct net *net, int group) return 0; } +int sock_diag_destroy(struct sock *sk, int err) +{ + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!sk->sk_prot->diag_destroy) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, err); +} +EXPORT_SYMBOL_GPL(sock_diag_destroy); + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c new file mode 100644 index 000000000000..1df98c557440 --- /dev/null +++ b/net/core/sock_reuseport.c @@ -0,0 +1,251 @@ +/* + * To speed up listener socket lookup, create an array to store all sockets + * listening on the same port. This allows a decision to be made after finding + * the first socket. An optional BPF program can also be configured for + * selecting the socket index from the array of available sockets. + */ + +#include <net/sock_reuseport.h> +#include <linux/bpf.h> +#include <linux/rcupdate.h> + +#define INIT_SOCKS 128 + +static DEFINE_SPINLOCK(reuseport_lock); + +static struct sock_reuseport *__reuseport_alloc(u16 max_socks) +{ + size_t size = sizeof(struct sock_reuseport) + + sizeof(struct sock *) * max_socks; + struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + + if (!reuse) + return NULL; + + reuse->max_socks = max_socks; + + RCU_INIT_POINTER(reuse->prog, NULL); + return reuse; +} + +int reuseport_alloc(struct sock *sk) +{ + struct sock_reuseport *reuse; + + /* bh lock used since this function call may precede hlist lock in + * soft irq of receive path or setsockopt from process context + */ + spin_lock_bh(&reuseport_lock); + WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + "multiple allocations for the same socket"); + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + + reuse->socks[0] = sk; + reuse->num_socks = 1; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_alloc); + +static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) +{ + struct sock_reuseport *more_reuse; + u32 more_socks_size, i; + + more_socks_size = reuse->max_socks * 2U; + if (more_socks_size > U16_MAX) + return NULL; + + more_reuse = __reuseport_alloc(more_socks_size); + if (!more_reuse) + return NULL; + + more_reuse->max_socks = more_socks_size; + more_reuse->num_socks = reuse->num_socks; + more_reuse->prog = reuse->prog; + + memcpy(more_reuse->socks, reuse->socks, + reuse->num_socks * sizeof(struct sock *)); + + for (i = 0; i < reuse->num_socks; ++i) + rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, + more_reuse); + + /* Note: we use kfree_rcu here instead of reuseport_free_rcu so + * that reuse and more_reuse can temporarily share a reference + * to prog. + */ + kfree_rcu(reuse, rcu); + return more_reuse; +} + +/** + * reuseport_add_sock - Add a socket to the reuseport group of another. + * @sk: New socket to add to the group. + * @sk2: Socket belonging to the existing reuseport group. + * May return ENOMEM and not add socket to group under memory pressure. + */ +int reuseport_add_sock(struct sock *sk, const struct sock *sk2) +{ + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + "socket already in reuseport group"); + + if (reuse->num_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + } + + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_select_sock() */ + smp_wmb(); + reuse->num_socks++; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_add_sock); + +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + +void reuseport_detach_sock(struct sock *sk) +{ + struct sock_reuseport *reuse; + int i; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + + for (i = 0; i < reuse->num_socks; i++) { + if (reuse->socks[i] == sk) { + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + if (reuse->num_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + break; + } + } + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_detach_sock); + +static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) +{ + struct sk_buff *nskb = NULL; + u32 index; + + if (skb_shared(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + skb = nskb; + } + + /* temporarily advance data past protocol header */ + if (!pskb_pull(skb, hdr_len)) { + kfree_skb(nskb); + return NULL; + } + index = bpf_prog_run_save_cb(prog, skb); + __skb_push(skb, hdr_len); + + consume_skb(nskb); + + if (index >= socks) + return NULL; + + return reuse->socks[index]; +} + +/** + * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. + * @sk: First socket in the group. + * @hash: When no BPF filter is available, use this hash to select. + * @skb: skb to run through BPF filter. + * @hdr_len: BPF filter expects skb data pointer at payload data. If + * the skb does not yet point at the payload, this parameter represents + * how far the pointer needs to advance to reach the payload. + * Returns a socket that should receive the packet (or NULL on error). + */ +struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len) +{ + struct sock_reuseport *reuse; + struct bpf_prog *prog; + struct sock *sk2 = NULL; + u16 socks; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + + /* if memory allocation failed or add call is not yet complete */ + if (!reuse) + goto out; + + prog = rcu_dereference(reuse->prog); + socks = READ_ONCE(reuse->num_socks); + if (likely(socks)) { + /* paired with smp_wmb() in reuseport_add_sock() */ + smp_rmb(); + + if (prog && skb) + sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + else + sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + } + +out: + rcu_read_unlock(); + return sk2; +} +EXPORT_SYMBOL(reuseport_select_sock); + +struct bpf_prog * +reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(reuse->prog, prog); + spin_unlock_bh(&reuseport_lock); + + return old_prog; +} +EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/core/stream.c b/net/core/stream.c index b96f7a79e544..159516a11b7e 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -35,7 +35,7 @@ void sk_stream_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) |
