diff options
Diffstat (limited to 'net')
216 files changed, 9158 insertions, 4543 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index f5ffc02729d6..9c95e8e054f9 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -110,39 +110,6 @@ static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) return skb; } -static void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) -{ - __be16 proto; - unsigned char *rawp; - - /* - * Was a VLAN packet, grab the encapsulated protocol, which the layer - * three protocols care about. - */ - - proto = vhdr->h_vlan_encapsulated_proto; - if (ntohs(proto) >= 1536) { - skb->protocol = proto; - return; - } - - rawp = skb->data; - if (*(unsigned short *) rawp == 0xFFFF) - /* - * This is a magic hack to spot IPX packets. Older Novell - * breaks the protocol design and runs IPX over 802.3 without - * an 802.2 LLC layer. We look for FFFF which isn't a used - * 802.2 SSAP/DSAP. This won't work for fault tolerant netware - * but does for the rest. - */ - skb->protocol = htons(ETH_P_802_3); - else - /* - * Real 802.2 LLC - */ - skb->protocol = htons(ETH_P_802_2); -} - struct sk_buff *vlan_untag(struct sk_buff *skb) { struct vlan_hdr *vhdr; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index bc2528624583..2b5fcde1f629 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -591,18 +591,17 @@ static void vlan_dev_uninit(struct net_device *dev) } } -static u32 vlan_dev_fix_features(struct net_device *dev, u32 features) +static netdev_features_t vlan_dev_fix_features(struct net_device *dev, + netdev_features_t features) { struct net_device *real_dev = vlan_dev_info(dev)->real_dev; u32 old_features = features; - features &= real_dev->features; features &= real_dev->vlan_features; + features |= NETIF_F_RXCSUM; + features &= real_dev->features; features |= old_features & NETIF_F_SOFT_FEATURES; - - if (dev_ethtool_get_rx_csum(real_dev)) - features |= NETIF_F_RXCSUM; features |= NETIF_F_LLTX; return features; diff --git a/net/Kconfig b/net/Kconfig index a07314844238..e07272d0bb2d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -215,6 +215,7 @@ source "net/sched/Kconfig" source "net/dcb/Kconfig" source "net/dns_resolver/Kconfig" source "net/batman-adv/Kconfig" +source "net/openvswitch/Kconfig" config RPS boolean @@ -232,6 +233,19 @@ config XPS depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS default y +config NETPRIO_CGROUP + tristate "Network priority cgroup" + depends on CGROUPS + ---help--- + Cgroup subsystem for use in assigning processes to network priorities on + a per-interface basis + +config BQL + boolean + depends on SYSFS + select DQL + default y + config HAVE_BPF_JIT bool diff --git a/net/Makefile b/net/Makefile index acdde4950de4..ad432fa4d934 100644 --- a/net/Makefile +++ b/net/Makefile @@ -69,3 +69,4 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ +obj-$(CONFIG_OPENVSWITCH) += openvswitch/ diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c index f41f02656ff4..876fbe83e2e4 100644 --- a/net/atm/atm_misc.c +++ b/net/atm/atm_misc.c @@ -26,7 +26,7 @@ struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc, int pdu_size, gfp_t gfp_flags) { struct sock *sk = sk_atm(vcc); - int guess = atm_guess_pdu2truesize(pdu_size); + int guess = SKB_TRUESIZE(pdu_size); atm_force_charge(vcc, guess); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { diff --git a/net/atm/br2684.c b/net/atm/br2684.c index d07223c834af..353fccf1cde3 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -53,6 +53,7 @@ static const unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 }; static const unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 }; static const unsigned char llc_oui_pid_pad[] = { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED }; +static const unsigned char pad[] = { PAD_BRIDGED }; static const unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 }; static const unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 }; @@ -202,7 +203,10 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev, { struct br2684_dev *brdev = BRPRIV(dev); struct atm_vcc *atmvcc; - int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2; + int minheadroom = (brvcc->encaps == e_llc) ? + ((brdev->payload == p_bridged) ? + sizeof(llc_oui_pid_pad) : sizeof(llc_oui_ipv4)) : + ((brdev->payload == p_bridged) ? BR2684_PAD_LEN : 0); if (skb_headroom(skb) < minheadroom) { struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom); @@ -450,7 +454,7 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; } else { /* p_bridged */ /* first 2 chars should be 0 */ - if (*((u16 *) (skb->data)) != 0) + if (memcmp(skb->data, pad, BR2684_PAD_LEN) != 0) goto error; skb_pull(skb, BR2684_PAD_LEN); skb->protocol = eth_type_trans(skb, net_dev); @@ -489,15 +493,11 @@ free_skb: */ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) { - struct sk_buff_head queue; - int err; struct br2684_vcc *brvcc; - struct sk_buff *skb, *tmp; - struct sk_buff_head *rq; struct br2684_dev *brdev; struct net_device *net_dev; struct atm_backend_br2684 be; - unsigned long flags; + int err; if (copy_from_user(&be, arg, sizeof be)) return -EFAULT; @@ -550,23 +550,6 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) atmvcc->push = br2684_push; atmvcc->pop = br2684_pop; - __skb_queue_head_init(&queue); - rq = &sk_atm(atmvcc)->sk_receive_queue; - - spin_lock_irqsave(&rq->lock, flags); - skb_queue_splice_init(rq, &queue); - spin_unlock_irqrestore(&rq->lock, flags); - - skb_queue_walk_safe(&queue, skb, tmp) { - struct net_device *dev; - - br2684_push(atmvcc, skb); - dev = skb->dev; - - dev->stats.rx_bytes -= skb->len; - dev->stats.rx_packets--; - } - /* initialize netdev carrier state */ if (atmvcc->dev->signal == ATM_PHY_SIG_LOST) netif_carrier_off(net_dev); @@ -574,6 +557,10 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) netif_carrier_on(net_dev); __module_get(THIS_MODULE); + + /* re-process everything received between connection setup and + backend setup */ + vcc_process_recv_queue(atmvcc); return 0; error: @@ -600,6 +587,7 @@ static void br2684_setup(struct net_device *netdev) struct br2684_dev *brdev = BRPRIV(netdev); ether_setup(netdev); + netdev->hard_header_len += sizeof(llc_oui_pid_pad); /* worst case */ brdev->net_dev = netdev; netdev->netdev_ops = &br2684_netdev_ops; @@ -612,7 +600,7 @@ static void br2684_setup_routed(struct net_device *netdev) struct br2684_dev *brdev = BRPRIV(netdev); brdev->net_dev = netdev; - netdev->hard_header_len = 0; + netdev->hard_header_len = sizeof(llc_oui_ipv4); /* worst case */ netdev->netdev_ops = &br2684_netdev_ops_routed; netdev->addr_len = 0; netdev->mtu = 1500; diff --git a/net/atm/clip.c b/net/atm/clip.c index 852394072fa1..c12c2582457c 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -33,6 +33,7 @@ #include <linux/slab.h> #include <net/route.h> /* for struct rtable and routing */ #include <net/icmp.h> /* icmp_send */ +#include <net/arp.h> #include <linux/param.h> /* for HZ */ #include <linux/uaccess.h> #include <asm/byteorder.h> /* for htons etc. */ @@ -119,7 +120,7 @@ out: /* The neighbour entry n->lock is held. */ static int neigh_check_cb(struct neighbour *n) { - struct atmarp_entry *entry = NEIGH2ENTRY(n); + struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; for (cv = entry->vccs; cv; cv = cv->next) { @@ -189,6 +190,13 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) struct clip_vcc *clip_vcc = CLIP_VCC(vcc); pr_debug("\n"); + + if (!clip_devs) { + atm_return(vcc, skb->truesize); + kfree_skb(skb); + return; + } + if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) @@ -255,8 +263,10 @@ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) { + __be32 *ip = (__be32 *) neigh->primary_key; + pr_debug("(neigh %p, skb %p)\n", neigh, skb); - to_atmarpd(act_need, PRIV(neigh->dev)->number, NEIGH2ENTRY(neigh)->ip); + to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip); } static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb) @@ -277,72 +287,24 @@ static const struct neigh_ops clip_neigh_ops = { static int clip_constructor(struct neighbour *neigh) { - struct atmarp_entry *entry = NEIGH2ENTRY(neigh); - struct net_device *dev = neigh->dev; - struct in_device *in_dev; - struct neigh_parms *parms; + struct atmarp_entry *entry = neighbour_priv(neigh); - pr_debug("(neigh %p, entry %p)\n", neigh, entry); - neigh->type = inet_addr_type(&init_net, entry->ip); - if (neigh->type != RTN_UNICAST) + if (neigh->tbl->family != AF_INET) return -EINVAL; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (!in_dev) { - rcu_read_unlock(); + if (neigh->type != RTN_UNICAST) return -EINVAL; - } - - parms = in_dev->arp_parms; - __neigh_parms_put(neigh->parms); - neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); + neigh->nud_state = NUD_NONE; neigh->ops = &clip_neigh_ops; - neigh->output = neigh->nud_state & NUD_VALID ? - neigh->ops->connected_output : neigh->ops->output; + neigh->output = neigh->ops->output; entry->neigh = neigh; entry->vccs = NULL; entry->expires = jiffies - 1; + return 0; } -static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd) -{ - return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd); -} - -static struct neigh_table clip_tbl = { - .family = AF_INET, - .entry_size = sizeof(struct neighbour)+sizeof(struct atmarp_entry), - .key_len = 4, - .hash = clip_hash, - .constructor = clip_constructor, - .id = "clip_arp_cache", - - /* parameters are copied from ARP ... */ - .parms = { - .tbl = &clip_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, - }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, -}; - /* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ /* @@ -376,28 +338,19 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, dev->stats.tx_dropped++; return NETDEV_TX_OK; } - n = dst_get_neighbour(dst); + n = dst_get_neighbour_noref(dst); if (!n) { -#if 0 - n = clip_find_neighbour(skb_dst(skb), 1); - if (!n) { - dev_kfree_skb(skb); /* lost that one */ - dev->stats.tx_dropped++; - return 0; - } - dst_set_neighbour(dst, n); -#endif pr_err("NO NEIGHBOUR !\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } - entry = NEIGH2ENTRY(n); + entry = neighbour_priv(n); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ; - to_atmarpd(act_need, PRIV(dev)->number, entry->ip); + to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key)); } if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) skb_queue_tail(&entry->neigh->arp_queue, skb); @@ -448,10 +401,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, static int clip_mkip(struct atm_vcc *vcc, int timeout) { - struct sk_buff_head *rq, queue; struct clip_vcc *clip_vcc; - struct sk_buff *skb, *tmp; - unsigned long flags; if (!vcc->push) return -EBADFD; @@ -472,29 +422,9 @@ static int clip_mkip(struct atm_vcc *vcc, int timeout) vcc->push = clip_push; vcc->pop = clip_pop; - __skb_queue_head_init(&queue); - rq = &sk_atm(vcc)->sk_receive_queue; - - spin_lock_irqsave(&rq->lock, flags); - skb_queue_splice_init(rq, &queue); - spin_unlock_irqrestore(&rq->lock, flags); - /* re-process everything received between connection setup and MKIP */ - skb_queue_walk_safe(&queue, skb, tmp) { - if (!clip_devs) { - atm_return(vcc, skb->truesize); - kfree_skb(skb); - } else { - struct net_device *dev = skb->dev; - unsigned int len = skb->len; - - skb_get(skb); - clip_push(vcc, skb); - dev->stats.rx_packets--; - dev->stats.rx_bytes -= len; - kfree_skb(skb); - } - } + vcc_process_recv_queue(vcc); + return 0; } @@ -523,11 +453,11 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) rt = ip_route_output(&init_net, ip, 0, 1, 0); if (IS_ERR(rt)) return PTR_ERR(rt); - neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1); + neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); ip_rt_put(rt); if (!neigh) return -ENOMEM; - entry = NEIGH2ENTRY(neigh); + entry = neighbour_priv(neigh); if (entry != clip_vcc->entry) { if (!clip_vcc->entry) pr_debug("add\n"); @@ -544,13 +474,15 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) } static const struct net_device_ops clip_netdev_ops = { - .ndo_start_xmit = clip_start_xmit, + .ndo_start_xmit = clip_start_xmit, + .ndo_neigh_construct = clip_constructor, }; static void clip_setup(struct net_device *dev) { dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; + dev->neigh_priv_len = sizeof(struct atmarp_entry); dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; dev->tx_queue_len = 100; /* "normal" queue (packets) */ @@ -604,10 +536,8 @@ static int clip_device_event(struct notifier_block *this, unsigned long event, if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; - if (event == NETDEV_UNREGISTER) { - neigh_ifdown(&clip_tbl, dev); + if (event == NETDEV_UNREGISTER) return NOTIFY_DONE; - } /* ignore non-CLIP devices */ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) @@ -787,9 +717,10 @@ static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) /* This means the neighbour entry has no attached VCC objects. */ #define SEQ_NO_VCC_TOKEN ((void *) 2) -static void atmarp_info(struct seq_file *seq, struct net_device *dev, +static void atmarp_info(struct seq_file *seq, struct neighbour *n, struct atmarp_entry *entry, struct clip_vcc *clip_vcc) { + struct net_device *dev = n->dev; unsigned long exp; char buf[17]; int svc, llc, off; @@ -809,8 +740,7 @@ static void atmarp_info(struct seq_file *seq, struct net_device *dev, seq_printf(seq, "%-6s%-4s%-4s%5ld ", dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp); - off = scnprintf(buf, sizeof(buf) - 1, "%pI4", - &entry->ip); + off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key); while (off < 16) buf[off++] = ' '; buf[off] = '\0'; @@ -881,14 +811,17 @@ static void *clip_seq_sub_iter(struct neigh_seq_state *_state, { struct clip_seq_state *state = (struct clip_seq_state *)_state; - return clip_seq_vcc_walk(state, NEIGH2ENTRY(n), pos); + if (n->dev->type != ARPHRD_ATM) + return NULL; + + return clip_seq_vcc_walk(state, neighbour_priv(n), pos); } static void *clip_seq_start(struct seq_file *seq, loff_t * pos) { struct clip_seq_state *state = seq->private; state->ns.neigh_sub_iter = clip_seq_sub_iter; - return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY); + return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY); } static int clip_seq_show(struct seq_file *seq, void *v) @@ -900,10 +833,10 @@ static int clip_seq_show(struct seq_file *seq, void *v) seq_puts(seq, atm_arp_banner); } else { struct clip_seq_state *state = seq->private; - struct neighbour *n = v; struct clip_vcc *vcc = state->vcc; + struct neighbour *n = v; - atmarp_info(seq, n->dev, NEIGH2ENTRY(n), vcc); + atmarp_info(seq, n, neighbour_priv(n), vcc); } return 0; } @@ -934,9 +867,6 @@ static void atm_clip_exit_noproc(void); static int __init atm_clip_init(void) { - neigh_table_init_no_netlink(&clip_tbl); - - clip_tbl_hook = &clip_tbl; register_atm_ioctl(&clip_ioctl_ops); register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); @@ -973,12 +903,6 @@ static void atm_clip_exit_noproc(void) */ del_timer_sync(&idle_timer); - /* Next, purge the table, so that the device - * unregister loop below does not hang due to - * device references remaining in the table. - */ - neigh_ifdown(&clip_tbl, NULL); - dev = clip_devs; while (dev) { next = PRIV(dev)->next; @@ -986,11 +910,6 @@ static void atm_clip_exit_noproc(void) free_netdev(dev); dev = next; } - - /* Now it is safe to fully shutdown whole table. */ - neigh_table_clear(&clip_tbl); - - clip_tbl_hook = NULL; } static void __exit atm_clip_exit(void) diff --git a/net/atm/common.c b/net/atm/common.c index 14ff9fe39989..b4b44dbed645 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -214,6 +214,26 @@ void vcc_release_async(struct atm_vcc *vcc, int reply) } EXPORT_SYMBOL(vcc_release_async); +void vcc_process_recv_queue(struct atm_vcc *vcc) +{ + struct sk_buff_head queue, *rq; + struct sk_buff *skb, *tmp; + unsigned long flags; + + __skb_queue_head_init(&queue); + rq = &sk_atm(vcc)->sk_receive_queue; + + spin_lock_irqsave(&rq->lock, flags); + skb_queue_splice_init(rq, &queue); + spin_unlock_irqrestore(&rq->lock, flags); + + skb_queue_walk_safe(&queue, skb, tmp) { + __skb_unlink(skb, &queue); + vcc->push(vcc, skb); + } +} +EXPORT_SYMBOL(vcc_process_recv_queue); + void atm_dev_signal_change(struct atm_dev *dev, char signal) { pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n", @@ -502,8 +522,11 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (sock->state != SS_CONNECTED) return -ENOTCONN; - if (flags & ~MSG_DONTWAIT) /* only handle MSG_DONTWAIT */ + + /* only handle MSG_DONTWAIT and MSG_PEEK */ + if (flags & ~(MSG_DONTWAIT | MSG_PEEK)) return -EOPNOTSUPP; + vcc = ATM_SD(sock); if (test_bit(ATM_VF_RELEASED, &vcc->flags) || test_bit(ATM_VF_CLOSE, &vcc->flags) || @@ -524,8 +547,13 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (error) return error; sock_recv_ts_and_drops(msg, sk, skb); - pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc), skb->truesize); - atm_return(vcc, skb->truesize); + + if (!(flags & MSG_PEEK)) { + pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc), + skb->truesize); + atm_return(vcc, skb->truesize); + } + skb_free_datagram(sk, skb); return copied; } diff --git a/net/atm/common.h b/net/atm/common.h index f48a76b6cdf4..cc3c2dae4d79 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -24,6 +24,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int vcc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); +void vcc_process_recv_queue(struct atm_vcc *vcc); int atmpvc_init(void); void atmpvc_exit(void); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index db4a11c61d15..df35d9a3b5fe 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -303,6 +303,10 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) atmvcc->push = pppoatm_push; atmvcc->pop = pppoatm_pop; __module_get(THIS_MODULE); + + /* re-process everything received between connection setup and + backend setup */ + vcc_process_recv_queue(atmvcc); return 0; } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index e7c69f4619ec..b863c1877c80 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -402,14 +402,14 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) break; case AX25_T1: - if (ax25_ctl.arg < 1) + if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ) goto einval_put; ax25->rtt = (ax25_ctl.arg * HZ) / 2; ax25->t1 = ax25_ctl.arg * HZ; break; case AX25_T2: - if (ax25_ctl.arg < 1) + if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ) goto einval_put; ax25->t2 = ax25_ctl.arg * HZ; break; @@ -422,10 +422,15 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) break; case AX25_T3: + if (ax25_ctl.arg > ULONG_MAX / HZ) + goto einval_put; ax25->t3 = ax25_ctl.arg * HZ; break; case AX25_IDLE: + if (ax25_ctl.arg > ULONG_MAX / (60 * HZ)) + goto einval_put; + ax25->idle = ax25_ctl.arg * 60 * HZ; break; @@ -571,7 +576,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; case AX25_T1: - if (opt < 1) { + if (opt < 1 || opt > ULONG_MAX / HZ) { res = -EINVAL; break; } @@ -580,7 +585,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; case AX25_T2: - if (opt < 1) { + if (opt < 1 || opt > ULONG_MAX / HZ) { res = -EINVAL; break; } @@ -596,7 +601,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; case AX25_T3: - if (opt < 1) { + if (opt < 1 || opt > ULONG_MAX / HZ) { res = -EINVAL; break; } @@ -604,7 +609,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; case AX25_IDLE: - if (opt < 0) { + if (opt < 0 || opt > ULONG_MAX / (60 * HZ)) { res = -EINVAL; break; } diff --git a/net/batman-adv/bat_sysfs.c b/net/batman-adv/bat_sysfs.c index b8a7414c3571..c25492f7d665 100644 --- a/net/batman-adv/bat_sysfs.c +++ b/net/batman-adv/bat_sysfs.c @@ -174,7 +174,7 @@ static int store_uint_attr(const char *buff, size_t count, unsigned long uint_val; int ret; - ret = strict_strtoul(buff, 10, &uint_val); + ret = kstrtoul(buff, 10, &uint_val); if (ret) { bat_info(net_dev, "%s: Invalid parameter received: %s\n", @@ -239,7 +239,7 @@ static ssize_t store_vis_mode(struct kobject *kobj, struct attribute *attr, unsigned long val; int ret, vis_mode_tmp = -1; - ret = strict_strtoul(buff, 10, &val); + ret = kstrtoul(buff, 10, &val); if (((count == 2) && (!ret) && (val == VIS_TYPE_CLIENT_UPDATE)) || (strncmp(buff, "client", 6) == 0) || diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 0be9ff346fa0..9bc63b209b3f 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -155,7 +155,7 @@ int bit_get_packet(void *priv, unsigned long *seq_bits, /* sequence number is much newer, probably missed a lot of packets */ if ((seq_num_diff >= TQ_LOCAL_WINDOW_SIZE) - || (seq_num_diff < EXPECTED_SEQNO_RANGE)) { + && (seq_num_diff < EXPECTED_SEQNO_RANGE)) { bat_dbg(DBG_BATMAN, bat_priv, "We missed a lot of packets (%i) !\n", seq_num_diff - 1); diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 619fb73b3b76..9373a143c6d4 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -25,6 +25,7 @@ #include "gateway_common.h" #include "hard-interface.h" #include "originator.h" +#include "translation-table.h" #include "routing.h" #include <linux/ip.h> #include <linux/ipv6.h> @@ -572,108 +573,142 @@ out: return ret; } -int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb, - struct orig_node *old_gw) +bool gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len) { struct ethhdr *ethhdr; struct iphdr *iphdr; struct ipv6hdr *ipv6hdr; struct udphdr *udphdr; - struct gw_node *curr_gw; - struct neigh_node *neigh_curr = NULL, *neigh_old = NULL; - unsigned int header_len = 0; - int ret = 1; - - if (atomic_read(&bat_priv->gw_mode) == GW_MODE_OFF) - return 0; /* check for ethernet header */ - if (!pskb_may_pull(skb, header_len + ETH_HLEN)) - return 0; + if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) + return false; ethhdr = (struct ethhdr *)skb->data; - header_len += ETH_HLEN; + *header_len += ETH_HLEN; /* check for initial vlan header */ if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) { - if (!pskb_may_pull(skb, header_len + VLAN_HLEN)) - return 0; + if (!pskb_may_pull(skb, *header_len + VLAN_HLEN)) + return false; ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); - header_len += VLAN_HLEN; + *header_len += VLAN_HLEN; } /* check for ip header */ switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: - if (!pskb_may_pull(skb, header_len + sizeof(*iphdr))) - return 0; - iphdr = (struct iphdr *)(skb->data + header_len); - header_len += iphdr->ihl * 4; + if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr))) + return false; + iphdr = (struct iphdr *)(skb->data + *header_len); + *header_len += iphdr->ihl * 4; /* check for udp header */ if (iphdr->protocol != IPPROTO_UDP) - return 0; + return false; break; case ETH_P_IPV6: - if (!pskb_may_pull(skb, header_len + sizeof(*ipv6hdr))) - return 0; - ipv6hdr = (struct ipv6hdr *)(skb->data + header_len); - header_len += sizeof(*ipv6hdr); + if (!pskb_may_pull(skb, *header_len + sizeof(*ipv6hdr))) + return false; + ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len); + *header_len += sizeof(*ipv6hdr); /* check for udp header */ if (ipv6hdr->nexthdr != IPPROTO_UDP) - return 0; + return false; break; default: - return 0; + return false; } - if (!pskb_may_pull(skb, header_len + sizeof(*udphdr))) - return 0; - udphdr = (struct udphdr *)(skb->data + header_len); - header_len += sizeof(*udphdr); + if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) + return false; + udphdr = (struct udphdr *)(skb->data + *header_len); + *header_len += sizeof(*udphdr); /* check for bootp port */ if ((ntohs(ethhdr->h_proto) == ETH_P_IP) && (ntohs(udphdr->dest) != 67)) - return 0; + return false; if ((ntohs(ethhdr->h_proto) == ETH_P_IPV6) && (ntohs(udphdr->dest) != 547)) - return 0; + return false; - if (atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER) - return -1; + return true; +} - curr_gw = gw_get_selected_gw_node(bat_priv); - if (!curr_gw) - return 0; - - /* If old_gw != NULL then this packet is unicast. - * So, at this point we have to check the message type: if it is a - * DHCPREQUEST we have to decide whether to drop it or not */ - if (old_gw && curr_gw->orig_node != old_gw) { - if (is_type_dhcprequest(skb, header_len)) { - /* If the dhcp packet has been sent to a different gw, - * we have to evaluate whether the old gw is still - * reliable enough */ - neigh_curr = find_router(bat_priv, curr_gw->orig_node, - NULL); - neigh_old = find_router(bat_priv, old_gw, NULL); - if (!neigh_curr || !neigh_old) - goto free_neigh; - if (neigh_curr->tq_avg - neigh_old->tq_avg < - GW_THRESHOLD) - ret = -1; - } +bool gw_out_of_range(struct bat_priv *bat_priv, + struct sk_buff *skb, struct ethhdr *ethhdr) +{ + struct neigh_node *neigh_curr = NULL, *neigh_old = NULL; + struct orig_node *orig_dst_node = NULL; + struct gw_node *curr_gw = NULL; + bool ret, out_of_range = false; + unsigned int header_len = 0; + uint8_t curr_tq_avg; + + ret = gw_is_dhcp_target(skb, &header_len); + if (!ret) + goto out; + + orig_dst_node = transtable_search(bat_priv, ethhdr->h_source, + ethhdr->h_dest); + if (!orig_dst_node) + goto out; + + if (!orig_dst_node->gw_flags) + goto out; + + ret = is_type_dhcprequest(skb, header_len); + if (!ret) + goto out; + + switch (atomic_read(&bat_priv->gw_mode)) { + case GW_MODE_SERVER: + /* If we are a GW then we are our best GW. We can artificially + * set the tq towards ourself as the maximum value */ + curr_tq_avg = TQ_MAX_VALUE; + break; + case GW_MODE_CLIENT: + curr_gw = gw_get_selected_gw_node(bat_priv); + if (!curr_gw) + goto out; + + /* packet is going to our gateway */ + if (curr_gw->orig_node == orig_dst_node) + goto out; + + /* If the dhcp packet has been sent to a different gw, + * we have to evaluate whether the old gw is still + * reliable enough */ + neigh_curr = find_router(bat_priv, curr_gw->orig_node, NULL); + if (!neigh_curr) + goto out; + + curr_tq_avg = neigh_curr->tq_avg; + break; + case GW_MODE_OFF: + default: + goto out; } -free_neigh: + + neigh_old = find_router(bat_priv, orig_dst_node, NULL); + if (!!neigh_old) + goto out; + + if (curr_tq_avg - neigh_old->tq_avg > GW_THRESHOLD) + out_of_range = true; + +out: + if (orig_dst_node) + orig_node_free_ref(orig_dst_node); + if (curr_gw) + gw_node_free_ref(curr_gw); if (neigh_old) neigh_node_free_ref(neigh_old); if (neigh_curr) neigh_node_free_ref(neigh_curr); - if (curr_gw) - gw_node_free_ref(curr_gw); - return ret; + return out_of_range; } diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index b9b983c07feb..e1edba08eb1d 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -31,7 +31,8 @@ void gw_node_update(struct bat_priv *bat_priv, void gw_node_delete(struct bat_priv *bat_priv, struct orig_node *orig_node); void gw_node_purge(struct bat_priv *bat_priv); int gw_client_seq_print_text(struct seq_file *seq, void *offset); -int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb, - struct orig_node *old_gw); +bool gw_is_dhcp_target(struct sk_buff *skb, unsigned int *header_len); +bool gw_out_of_range(struct bat_priv *bat_priv, + struct sk_buff *skb, struct ethhdr *ethhdr); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 18661af0bc3b..c4ac7b0a2a63 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -97,7 +97,7 @@ static bool parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = strict_strtol(buff, 10, &ldown); + ret = kstrtol(buff, 10, &ldown); if (ret) { bat_err(net_dev, "Download speed of gateway mode invalid: %s\n", @@ -122,7 +122,7 @@ static bool parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = strict_strtol(slash_ptr + 1, 10, &lup); + ret = kstrtol(slash_ptr + 1, 10, &lup); if (ret) { bat_err(net_dev, "Upload speed of gateway mode invalid: " diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 2a172505f513..d1da29da333b 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -25,7 +25,7 @@ /* clears the hash */ static void hash_init(struct hashtable_t *hash) { - int i; + uint32_t i; for (i = 0 ; i < hash->size; i++) { INIT_HLIST_HEAD(&hash->table[i]); @@ -42,7 +42,7 @@ void hash_destroy(struct hashtable_t *hash) } /* allocates and clears the hash */ -struct hashtable_t *hash_new(int size) +struct hashtable_t *hash_new(uint32_t size) { struct hashtable_t *hash; diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index d20aa71ba1e8..4768717f07f9 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -33,17 +33,17 @@ typedef int (*hashdata_compare_cb)(const struct hlist_node *, const void *); /* the hashfunction, should return an index * based on the key in the data of the first * argument and the size the second */ -typedef int (*hashdata_choose_cb)(const void *, int); +typedef uint32_t (*hashdata_choose_cb)(const void *, uint32_t); typedef void (*hashdata_free_cb)(struct hlist_node *, void *); struct hashtable_t { struct hlist_head *table; /* the hashtable itself with the buckets */ spinlock_t *list_locks; /* spinlock for each hash list entry */ - int size; /* size of hashtable */ + uint32_t size; /* size of hashtable */ }; /* allocates and clears the hash */ -struct hashtable_t *hash_new(int size); +struct hashtable_t *hash_new(uint32_t size); /* free only the hashtable and the hash itself. */ void hash_destroy(struct hashtable_t *hash); @@ -57,7 +57,7 @@ static inline void hash_delete(struct hashtable_t *hash, struct hlist_head *head; struct hlist_node *node, *node_tmp; spinlock_t *list_lock; /* spinlock to protect write access */ - int i; + uint32_t i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -93,7 +93,8 @@ static inline int hash_add(struct hashtable_t *hash, hashdata_choose_cb choose, const void *data, struct hlist_node *data_node) { - int index, ret = -1; + uint32_t index; + int ret = -1; struct hlist_head *head; struct hlist_node *node; spinlock_t *list_lock; /* spinlock to protect write access */ @@ -137,7 +138,7 @@ static inline void *hash_remove(struct hashtable_t *hash, hashdata_compare_cb compare, hashdata_choose_cb choose, void *data) { - size_t index; + uint32_t index; struct hlist_node *node; struct hlist_head *head; void *data_save = NULL; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 964ad4d8ba33..86354e06eb48 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -28,7 +28,7 @@ #define DRIVER_DEVICE "batman-adv" #ifndef SOURCE_VERSION -#define SOURCE_VERSION "2011.4.0" +#define SOURCE_VERSION "2012.0.0" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 0e5b77255d99..0bc2045a2f2e 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -164,7 +164,7 @@ void originator_free(struct bat_priv *bat_priv) struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct orig_node *orig_node; - int i; + uint32_t i; if (!hash) return; @@ -350,7 +350,7 @@ static void _purge_orig(struct bat_priv *bat_priv) struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct orig_node *orig_node; - int i; + uint32_t i; if (!hash) return; @@ -413,7 +413,8 @@ int orig_seq_print_text(struct seq_file *seq, void *offset) int batman_count = 0; int last_seen_secs; int last_seen_msecs; - int i, ret = 0; + uint32_t i; + int ret = 0; primary_if = primary_if_get_selected(bat_priv); @@ -519,7 +520,8 @@ int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num) struct hlist_node *node; struct hlist_head *head; struct orig_node *orig_node; - int i, ret; + uint32_t i; + int ret; /* resize all orig nodes because orig_node->bcast_own(_sum) depend on * if_num */ @@ -601,7 +603,8 @@ int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num) struct hlist_head *head; struct hard_iface *hard_iface_tmp; struct orig_node *orig_node; - int i, ret; + uint32_t i; + int ret; /* resize all orig nodes because orig_node->bcast_own(_sum) depend on * if_num */ diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index cfc1f60a96a1..67765ffef731 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -42,7 +42,7 @@ int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num); /* hashfunction to choose an entry in a hash table of given size */ /* hash algorithm from http://en.wikipedia.org/wiki/Hash_table */ -static inline int choose_orig(const void *data, int32_t size) +static inline uint32_t choose_orig(const void *data, uint32_t size) { const unsigned char *key = data; uint32_t hash = 0; diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f961cc5eade5..ef24a7205f65 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -39,7 +39,7 @@ void slide_own_bcast_window(struct hard_iface *hard_iface) struct hlist_head *head; struct orig_node *orig_node; unsigned long *word; - int i; + uint32_t i; size_t word_index; for (i = 0; i < hash->size; i++) { @@ -578,6 +578,7 @@ int recv_tt_query(struct sk_buff *skb, struct hard_iface *recv_if) { struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface); struct tt_query_packet *tt_query; + uint16_t tt_len; struct ethhdr *ethhdr; /* drop packet if it has not necessary minimum size */ @@ -616,13 +617,22 @@ int recv_tt_query(struct sk_buff *skb, struct hard_iface *recv_if) } break; case TT_RESPONSE: - /* packet needs to be linearized to access the TT changes */ - if (skb_linearize(skb) < 0) - goto out; + if (is_my_mac(tt_query->dst)) { + /* packet needs to be linearized to access the TT + * changes */ + if (skb_linearize(skb) < 0) + goto out; + + tt_len = tt_query->tt_data * sizeof(struct tt_change); + + /* Ensure we have all the claimed data */ + if (unlikely(skb_headlen(skb) < + sizeof(struct tt_query_packet) + + tt_len)) + goto out; - if (is_my_mac(tt_query->dst)) handle_tt_response(bat_priv, tt_query); - else { + } else { bat_dbg(DBG_TT, bat_priv, "Routing TT_RESPONSE to %pM [%c]\n", tt_query->dst, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f9cc95728989..45297c843092 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -563,10 +563,10 @@ static int interface_tx(struct sk_buff *skb, struct net_device *soft_iface) struct bcast_packet *bcast_packet; struct vlan_ethhdr *vhdr; struct softif_neigh *curr_softif_neigh = NULL; - struct orig_node *orig_node = NULL; + unsigned int header_len = 0; int data_len = skb->len, ret; short vid = -1; - bool do_bcast; + bool do_bcast = false; if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE) goto dropped; @@ -598,17 +598,28 @@ static int interface_tx(struct sk_buff *skb, struct net_device *soft_iface) /* Register the client MAC in the transtable */ tt_local_add(soft_iface, ethhdr->h_source, skb->skb_iif); - orig_node = transtable_search(bat_priv, ethhdr->h_source, - ethhdr->h_dest); - do_bcast = is_multicast_ether_addr(ethhdr->h_dest); - if (do_bcast || (orig_node && orig_node->gw_flags)) { - ret = gw_is_target(bat_priv, skb, orig_node); + if (is_multicast_ether_addr(ethhdr->h_dest)) { + do_bcast = true; - if (ret < 0) - goto dropped; - - if (ret) - do_bcast = false; + switch (atomic_read(&bat_priv->gw_mode)) { + case GW_MODE_SERVER: + /* gateway servers should not send dhcp + * requests into the mesh */ + ret = gw_is_dhcp_target(skb, &header_len); + if (ret) + goto dropped; + break; + case GW_MODE_CLIENT: + /* gateway clients should send dhcp requests + * via unicast to their gateway */ + ret = gw_is_dhcp_target(skb, &header_len); + if (ret) + do_bcast = false; + break; + case GW_MODE_OFF: + default: + break; + } } /* ethernet packet should be broadcasted */ @@ -644,6 +655,12 @@ static int interface_tx(struct sk_buff *skb, struct net_device *soft_iface) /* unicast packet */ } else { + if (atomic_read(&bat_priv->gw_mode) != GW_MODE_OFF) { + ret = gw_out_of_range(bat_priv, skb, ethhdr); + if (ret) + goto dropped; + } + ret = unicast_send_skb(skb, bat_priv); if (ret != 0) goto dropped_freed; @@ -662,8 +679,6 @@ end: softif_neigh_free_ref(curr_softif_neigh); if (primary_if) hardif_free_ref(primary_if); - if (orig_node) - orig_node_free_ref(orig_node); return NETDEV_TX_OK; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index c7aafc7c5ed4..78b9528bfc2a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -67,7 +67,7 @@ static struct tt_local_entry *tt_local_hash_find(struct bat_priv *bat_priv, struct hlist_head *head; struct hlist_node *node; struct tt_local_entry *tt_local_entry, *tt_local_entry_tmp = NULL; - int index; + uint32_t index; if (!hash) return NULL; @@ -99,7 +99,7 @@ static struct tt_global_entry *tt_global_hash_find(struct bat_priv *bat_priv, struct hlist_node *node; struct tt_global_entry *tt_global_entry; struct tt_global_entry *tt_global_entry_tmp = NULL; - int index; + uint32_t index; if (!hash) return NULL; @@ -314,9 +314,8 @@ int tt_local_seq_print_text(struct seq_file *seq, void *offset) struct hard_iface *primary_if; struct hlist_node *node; struct hlist_head *head; - size_t buf_size, pos; - char *buff; - int i, ret = 0; + uint32_t i; + int ret = 0; primary_if = primary_if_get_selected(bat_priv); if (!primary_if) { @@ -337,34 +336,13 @@ int tt_local_seq_print_text(struct seq_file *seq, void *offset) "announced via TT (TTVN: %u):\n", net_dev->name, (uint8_t)atomic_read(&bat_priv->ttvn)); - buf_size = 1; - /* Estimate length for: " * xx:xx:xx:xx:xx:xx\n" */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - __hlist_for_each_rcu(node, head) - buf_size += 29; - rcu_read_unlock(); - } - - buff = kmalloc(buf_size, GFP_ATOMIC); - if (!buff) { - ret = -ENOMEM; - goto out; - } - - buff[0] = '\0'; - pos = 0; - for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_local_entry, node, head, hash_entry) { - pos += snprintf(buff + pos, 30, " * %pM " - "[%c%c%c%c%c]\n", + seq_printf(seq, " * %pM [%c%c%c%c%c]\n", tt_local_entry->addr, (tt_local_entry->flags & TT_CLIENT_ROAM ? 'R' : '.'), @@ -379,9 +357,6 @@ int tt_local_seq_print_text(struct seq_file *seq, void *offset) } rcu_read_unlock(); } - - seq_printf(seq, "%s", buff); - kfree(buff); out: if (primary_if) hardif_free_ref(primary_if); @@ -427,7 +402,7 @@ static void tt_local_purge(struct bat_priv *bat_priv) struct hlist_node *node, *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - int i; + uint32_t i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -465,7 +440,7 @@ static void tt_local_table_free(struct bat_priv *bat_priv) struct tt_local_entry *tt_local_entry; struct hlist_node *node, *node_tmp; struct hlist_head *head; - int i; + uint32_t i; if (!bat_priv->tt_local_hash) return; @@ -590,9 +565,8 @@ int tt_global_seq_print_text(struct seq_file *seq, void *offset) struct hard_iface *primary_if; struct hlist_node *node; struct hlist_head *head; - size_t buf_size, pos; - char *buff; - int i, ret = 0; + uint32_t i; + int ret = 0; primary_if = primary_if_get_selected(bat_priv); if (!primary_if) { @@ -615,35 +589,13 @@ int tt_global_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, " %-13s %s %-15s %s %s\n", "Client", "(TTVN)", "Originator", "(Curr TTVN)", "Flags"); - buf_size = 1; - /* Estimate length for: " * xx:xx:xx:xx:xx:xx (ttvn) via - * xx:xx:xx:xx:xx:xx (cur_ttvn)\n"*/ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - __hlist_for_each_rcu(node, head) - buf_size += 67; - rcu_read_unlock(); - } - - buff = kmalloc(buf_size, GFP_ATOMIC); - if (!buff) { - ret = -ENOMEM; - goto out; - } - - buff[0] = '\0'; - pos = 0; - for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(tt_global_entry, node, head, hash_entry) { - pos += snprintf(buff + pos, 69, - " * %pM (%3u) via %pM (%3u) " + seq_printf(seq, " * %pM (%3u) via %pM (%3u) " "[%c%c%c]\n", tt_global_entry->addr, tt_global_entry->ttvn, tt_global_entry->orig_node->orig, @@ -659,9 +611,6 @@ int tt_global_seq_print_text(struct seq_file *seq, void *offset) } rcu_read_unlock(); } - - seq_printf(seq, "%s", buff); - kfree(buff); out: if (primary_if) hardif_free_ref(primary_if); @@ -716,7 +665,7 @@ void tt_global_del_orig(struct bat_priv *bat_priv, struct orig_node *orig_node, const char *message) { struct tt_global_entry *tt_global_entry; - int i; + uint32_t i; struct hashtable_t *hash = bat_priv->tt_global_hash; struct hlist_node *node, *safe; struct hlist_head *head; @@ -735,9 +684,10 @@ void tt_global_del_orig(struct bat_priv *bat_priv, if (tt_global_entry->orig_node == orig_node) { bat_dbg(DBG_TT, bat_priv, "Deleting global tt entry %pM " - "(via %pM): originator time out\n", + "(via %pM): %s\n", tt_global_entry->addr, - tt_global_entry->orig_node->orig); + tt_global_entry->orig_node->orig, + message); hlist_del_rcu(node); tt_global_entry_free_ref(tt_global_entry); } @@ -754,7 +704,7 @@ static void tt_global_roam_purge(struct bat_priv *bat_priv) struct hlist_node *node, *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - int i; + uint32_t i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -788,7 +738,7 @@ static void tt_global_table_free(struct bat_priv *bat_priv) struct tt_global_entry *tt_global_entry; struct hlist_node *node, *node_tmp; struct hlist_head *head; - int i; + uint32_t i; if (!bat_priv->tt_global_hash) return; @@ -874,7 +824,8 @@ uint16_t tt_global_crc(struct bat_priv *bat_priv, struct orig_node *orig_node) struct tt_global_entry *tt_global_entry; struct hlist_node *node; struct hlist_head *head; - int i, j; + uint32_t i; + int j; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -911,7 +862,8 @@ uint16_t tt_local_crc(struct bat_priv *bat_priv) struct tt_local_entry *tt_local_entry; struct hlist_node *node; struct hlist_head *head; - int i, j; + uint32_t i; + int j; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1048,7 +1000,7 @@ static struct sk_buff *tt_response_fill_table(uint16_t tt_len, uint8_t ttvn, struct sk_buff *skb = NULL; uint16_t tt_tot, tt_count; ssize_t tt_query_size = sizeof(struct tt_query_packet); - int i; + uint32_t i; if (tt_query_size + tt_len > primary_if->soft_iface->mtu) { tt_len = primary_if->soft_iface->mtu - tt_query_size; @@ -1187,11 +1139,11 @@ static bool send_other_tt_response(struct bat_priv *bat_priv, (tt_request->flags & TT_FULL_TABLE ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ - req_dst_orig_node = get_orig_node(bat_priv, tt_request->dst); + req_dst_orig_node = orig_hash_find(bat_priv, tt_request->dst); if (!req_dst_orig_node) goto out; - res_dst_orig_node = get_orig_node(bat_priv, tt_request->src); + res_dst_orig_node = orig_hash_find(bat_priv, tt_request->src); if (!res_dst_orig_node) goto out; @@ -1317,7 +1269,7 @@ static bool send_my_tt_response(struct bat_priv *bat_priv, my_ttvn = (uint8_t)atomic_read(&bat_priv->ttvn); req_ttvn = tt_request->ttvn; - orig_node = get_orig_node(bat_priv, tt_request->src); + orig_node = orig_hash_find(bat_priv, tt_request->src); if (!orig_node) goto out; @@ -1725,7 +1677,7 @@ void tt_free(struct bat_priv *bat_priv) * entry */ static void tt_local_reset_flags(struct bat_priv *bat_priv, uint16_t flags) { - int i; + uint32_t i; struct hashtable_t *hash = bat_priv->tt_local_hash; struct hlist_head *head; struct hlist_node *node; @@ -1758,7 +1710,7 @@ static void tt_local_purge_pending_clients(struct bat_priv *bat_priv) struct hlist_node *node, *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - int i; + uint32_t i; if (!hash) return; diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c index f81a6b668b0c..7445413253ca 100644 --- a/net/batman-adv/vis.c +++ b/net/batman-adv/vis.c @@ -66,7 +66,7 @@ static int vis_info_cmp(const struct hlist_node *node, const void *data2) /* hash function to choose an entry in a hash table of given size */ /* hash algorithm from http://en.wikipedia.org/wiki/Hash_table */ -static int vis_info_choose(const void *data, int size) +static uint32_t vis_info_choose(const void *data, uint32_t size) { const struct vis_info *vis_info = data; const struct vis_packet *packet; @@ -96,7 +96,7 @@ static struct vis_info *vis_hash_find(struct bat_priv *bat_priv, struct hlist_head *head; struct hlist_node *node; struct vis_info *vis_info, *vis_info_tmp = NULL; - int index; + uint32_t index; if (!hash) return NULL; @@ -202,7 +202,8 @@ int vis_seq_print_text(struct seq_file *seq, void *offset) HLIST_HEAD(vis_if_list); struct if_list_entry *entry; struct hlist_node *pos, *n; - int i, j, ret = 0; + uint32_t i; + int j, ret = 0; int vis_server = atomic_read(&bat_priv->vis_mode); size_t buff_pos, buf_size; char *buff; @@ -556,7 +557,8 @@ static int find_best_vis_server(struct bat_priv *bat_priv, struct hlist_head *head; struct orig_node *orig_node; struct vis_packet *packet; - int best_tq = -1, i; + int best_tq = -1; + uint32_t i; packet = (struct vis_packet *)info->skb_packet->data; @@ -608,7 +610,8 @@ static int generate_vis_packet(struct bat_priv *bat_priv) struct vis_packet *packet = (struct vis_packet *)info->skb_packet->data; struct vis_info_entry *entry; struct tt_local_entry *tt_local_entry; - int best_tq = -1, i; + int best_tq = -1; + uint32_t i; info->first_seen = jiffies; packet->vis_type = atomic_read(&bat_priv->vis_mode); @@ -696,7 +699,7 @@ unlock: * held */ static void purge_vis_packets(struct bat_priv *bat_priv) { - int i; + uint32_t i; struct hashtable_t *hash = bat_priv->vis_hash; struct hlist_node *node, *node_tmp; struct hlist_head *head; @@ -733,7 +736,7 @@ static void broadcast_vis_packet(struct bat_priv *bat_priv, struct sk_buff *skb; struct hard_iface *hard_iface; uint8_t dstaddr[ETH_ALEN]; - int i; + uint32_t i; packet = (struct vis_packet *)info->skb_packet->data; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index feb77ea7b58e..a3754ac262c3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -186,7 +186,8 @@ static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) strcpy(info->bus_info, "N/A"); } -static u32 br_fix_features(struct net_device *dev, u32 features) +static netdev_features_t br_fix_features(struct net_device *dev, + netdev_features_t features) { struct net_bridge *br = netdev_priv(dev); @@ -341,10 +342,10 @@ void br_dev_setup(struct net_device *dev) dev->priv_flags = IFF_EBRIDGE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | - NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX | + NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_TX; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | - NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | + NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_TX; br->dev = dev; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c8e7861b88b0..973813e34428 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -556,7 +556,7 @@ skip: return skb->len; } -/* Create new static fdb entry */ +/* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, __u16 state, __u16 flags) { @@ -575,16 +575,21 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, } else { if (flags & NLM_F_EXCL) return -EEXIST; + } + + if (fdb_to_nud(fdb) != state) { + if (state & NUD_PERMANENT) + fdb->is_local = fdb->is_static = 1; + else if (state & NUD_NOARP) { + fdb->is_local = 0; + fdb->is_static = 1; + } else + fdb->is_local = fdb->is_static = 0; - if (flags & NLM_F_REPLACE) - fdb->updated = fdb->used = jiffies; - fdb->is_local = fdb->is_static = 0; + fdb->updated = fdb->used = jiffies; + fdb_notify(fdb, RTM_NEWNEIGH); } - if (state & NUD_PERMANENT) - fdb->is_local = fdb->is_static = 1; - else if (state & NUD_NOARP) - fdb->is_static = 1; return 0; } @@ -627,6 +632,11 @@ int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -EINVAL; } + if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { + pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); + return -EINVAL; + } + p = br_port_get_rtnl(dev); if (p == NULL) { pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", @@ -634,9 +644,15 @@ int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return -EINVAL; } - spin_lock_bh(&p->br->hash_lock); - err = fdb_add_entry(p, addr, ndm->ndm_state, nlh->nlmsg_flags); - spin_unlock_bh(&p->br->hash_lock); + if (ndm->ndm_flags & NTF_USE) { + rcu_read_lock(); + br_fdb_update(p->br, p, addr); + rcu_read_unlock(); + } else { + spin_lock_bh(&p->br->hash_lock); + err = fdb_add_entry(p, addr, ndm->ndm_state, nlh->nlmsg_flags); + spin_unlock_bh(&p->br->hash_lock); + } return err; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f603e5b0b930..0a942fbccc9a 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -296,10 +296,11 @@ int br_min_mtu(const struct net_bridge *br) /* * Recomputes features using slave's features */ -u32 br_features_recompute(struct net_bridge *br, u32 features) +netdev_features_t br_features_recompute(struct net_bridge *br, + netdev_features_t features) { struct net_bridge_port *p; - u32 mask; + netdev_features_t mask; if (list_empty(&br->port_list)) return features; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 995cbe0ac0b2..375417e633c9 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -127,7 +127,7 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get( { struct br_ip br_dst; - ipv6_addr_copy(&br_dst.u.ip6, dst); + br_dst.u.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); return br_mdb_ip_get(mdb, &br_dst); @@ -154,7 +154,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case htons(ETH_P_IPV6): - ipv6_addr_copy(&ip.u.ip6, &ipv6_hdr(skb)->daddr); + ip.u.ip6 = ipv6_hdr(skb)->daddr; break; #endif default: @@ -474,7 +474,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; - ipv6_addr_copy(&mldq->mld_mca, group); + mldq->mld_mca = *group; /* checksum */ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -783,7 +783,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, if (!ipv6_is_transient_multicast(group)) return 0; - ipv6_addr_copy(&br_group.u.ip6, group); + br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); return br_multicast_add_group(br, port, &br_group); @@ -1344,7 +1344,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, if (!ipv6_is_transient_multicast(group)) return; - ipv6_addr_copy(&br_group.u.ip6, group); + br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_multicast_leave_group(br, port, &br_group); @@ -1458,6 +1458,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, const struct ipv6hdr *ip6h; u8 icmp6_type; u8 nexthdr; + __be16 frag_off; unsigned len; int offset; int err; @@ -1483,7 +1484,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, return -EINVAL; nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr); + offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); if (offset < 0 || nexthdr != IPPROTO_ICMPV6) return 0; @@ -1501,6 +1502,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, __skb_pull(skb2, offset); skb_reset_transport_header(skb2); + skb_postpull_rcsum(skb2, skb_network_header(skb2), + skb_network_header_len(skb2)); icmp6_type = icmp6_hdr(skb2)->icmp6_type; @@ -1770,7 +1773,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) int err = 0; struct net_bridge_mdb_htable *mdb; - spin_lock(&br->multicast_lock); + spin_lock_bh(&br->multicast_lock); if (br->multicast_disabled == !val) goto unlock; @@ -1806,7 +1809,7 @@ rollback: } unlock: - spin_unlock(&br->multicast_lock); + spin_unlock_bh(&br->multicast_lock); return err; } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d6ec3720c77e..834dfabb30f9 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) if (!skb->dev) goto free_skb; dst = skb_dst(skb); - neigh = dst_get_neighbour(dst); + neigh = dst_get_neighbour_noref(dst); if (neigh->hh.hh_len) { neigh_hh_bridge(&neigh->hh, skb); skb->dev = nf_bridge->physindev; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e5f9ece3c9a0..a1daf8227ed1 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -18,6 +18,7 @@ #include <net/sock.h> #include "br_private.h" +#include "br_private_stp.h" static inline size_t br_nlmsg_size(void) { @@ -188,6 +189,11 @@ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) p->state = new_state; br_log_state(p); + + spin_lock_bh(&p->br->lock); + br_port_state_selection(p->br); + spin_unlock_bh(&p->br->lock); + br_ifinfo_notify(RTM_NEWLINK, p); return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d7d6fb05411f..4027029aa5e4 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -387,7 +387,8 @@ extern int br_add_if(struct net_bridge *br, extern int br_del_if(struct net_bridge *br, struct net_device *dev); extern int br_min_mtu(const struct net_bridge *br); -extern u32 br_features_recompute(struct net_bridge *br, u32 features); +extern netdev_features_t br_features_recompute(struct net_bridge *br, + netdev_features_t features); /* br_input.c */ extern int br_handle_frame_finish(struct sk_buff *skb); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index ad0a3f7cf6cc..dd147d78a588 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -399,25 +399,24 @@ void br_port_state_selection(struct net_bridge *br) struct net_bridge_port *p; unsigned int liveports = 0; - /* Don't change port states if userspace is handling STP */ - if (br->stp_enabled == BR_USER_STP) - return; - list_for_each_entry(p, &br->port_list, list) { if (p->state == BR_STATE_DISABLED) continue; - if (p->port_no == br->root_port) { - p->config_pending = 0; - p->topology_change_ack = 0; - br_make_forwarding(p); - } else if (br_is_designated_port(p)) { - del_timer(&p->message_age_timer); - br_make_forwarding(p); - } else { - p->config_pending = 0; - p->topology_change_ack = 0; - br_make_blocking(p); + /* Don't change port states if userspace is handling STP */ + if (br->stp_enabled != BR_USER_STP) { + if (p->port_no == br->root_port) { + p->config_pending = 0; + p->topology_change_ack = 0; + br_make_forwarding(p); + } else if (br_is_designated_port(p)) { + del_timer(&p->message_age_timer); + br_make_forwarding(p); + } else { + p->config_pending = 0; + p->topology_change_ack = 0; + br_make_blocking(p); + } } if (p->state == BR_STATE_FORWARDING) diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 2ed0056a39a8..99c85668f551 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -55,9 +55,10 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->bitmask & EBT_IP6_PROTO) { uint8_t nexthdr = ih6->nexthdr; + __be16 frag_off; int offset_ph; - offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr); + offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off); if (offset_ph == -1) return false; if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 6e5a8bb9b940..88d7d1d1cb1b 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -113,6 +113,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum, const struct ipv6hdr *ih; struct ipv6hdr _iph; uint8_t nexthdr; + __be16 frag_off; int offset_ph; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); @@ -123,7 +124,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum, printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d", &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr); nexthdr = ih->nexthdr; - offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr); + offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off); if (offset_ph == -1) goto out; print_ports(skb, nexthdr, offset_ph); diff --git a/net/caif/Kconfig b/net/caif/Kconfig index 529750da9624..936361e5a2b6 100644 --- a/net/caif/Kconfig +++ b/net/caif/Kconfig @@ -40,3 +40,14 @@ config CAIF_NETDEV If you select to build it as a built-in then the main CAIF device must also be a built-in. If unsure say Y. + +config CAIF_USB + tristate "CAIF USB support" + depends on CAIF + default n + ---help--- + Say Y if you are using CAIF over USB CDC NCM. + This can be either built-in or a loadable module, + If you select to build it as a built-in then the main CAIF device must + also be a built-in. + If unsure say N. diff --git a/net/caif/Makefile b/net/caif/Makefile index ebcd4e7e6f47..cc2b51154d03 100644 --- a/net/caif/Makefile +++ b/net/caif/Makefile @@ -10,5 +10,6 @@ caif-y := caif_dev.o \ obj-$(CONFIG_CAIF) += caif.o obj-$(CONFIG_CAIF_NETDEV) += chnl_net.o obj-$(CONFIG_CAIF) += caif_socket.o +obj-$(CONFIG_CAIF_USB) += caif_usb.o export-y := caif.o diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index f1fa1f6e658d..9b298c14028d 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -17,6 +17,7 @@ #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/module.h> +#include <linux/spinlock.h> #include <net/netns/generic.h> #include <net/net_namespace.h> #include <net/pkt_sched.h> @@ -24,6 +25,7 @@ #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> +#include <net/caif/cfserl.h> MODULE_LICENSE("GPL"); @@ -33,6 +35,10 @@ struct caif_device_entry { struct list_head list; struct net_device *netdev; int __percpu *pcpu_refcnt; + spinlock_t flow_lock; + struct sk_buff *xoff_skb; + void (*xoff_skb_dtor)(struct sk_buff *skb); + bool xoff; }; struct caif_device_entry_list { @@ -47,13 +53,15 @@ struct caif_net { }; static int caif_net_id; +static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; BUG_ON(!net); caifn = net_generic(net, caif_net_id); - BUG_ON(!caifn); + if (!caifn) + return NULL; return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); @@ -63,7 +71,8 @@ static struct caif_device_entry_list *caif_device_list(struct net *net) struct caif_net *caifn; BUG_ON(!net); caifn = net_generic(net, caif_net_id); - BUG_ON(!caifn); + if (!caifn) + return NULL; return &caifn->caifdevs; } @@ -92,7 +101,8 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) struct caif_device_entry *caifd; caifdevs = caif_device_list(dev_net(dev)); - BUG_ON(!caifdevs); + if (!caifdevs) + return NULL; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) @@ -112,7 +122,9 @@ static struct caif_device_entry *caif_get(struct net_device *dev) struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - BUG_ON(!caifdevs); + if (!caifdevs) + return NULL; + list_for_each_entry_rcu(caifd, &caifdevs->list, list) { if (caifd->netdev == dev) return caifd; @@ -120,15 +132,106 @@ static struct caif_device_entry *caif_get(struct net_device *dev) return NULL; } +void caif_flow_cb(struct sk_buff *skb) +{ + struct caif_device_entry *caifd; + void (*dtor)(struct sk_buff *skb) = NULL; + bool send_xoff; + + WARN_ON(skb->dev == NULL); + + rcu_read_lock(); + caifd = caif_get(skb->dev); + caifd_hold(caifd); + rcu_read_unlock(); + + spin_lock_bh(&caifd->flow_lock); + send_xoff = caifd->xoff; + caifd->xoff = 0; + if (!WARN_ON(caifd->xoff_skb_dtor == NULL)) { + WARN_ON(caifd->xoff_skb != skb); + dtor = caifd->xoff_skb_dtor; + caifd->xoff_skb = NULL; + caifd->xoff_skb_dtor = NULL; + } + spin_unlock_bh(&caifd->flow_lock); + + if (dtor) + dtor(skb); + + if (send_xoff) + caifd->layer.up-> + ctrlcmd(caifd->layer.up, + _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, + caifd->layer.id); + caifd_put(caifd); +} + static int transmit(struct cflayer *layer, struct cfpkt *pkt) { - int err; + int err, high = 0, qlen = 0; + struct caif_dev_common *caifdev; struct caif_device_entry *caifd = container_of(layer, struct caif_device_entry, layer); struct sk_buff *skb; + struct netdev_queue *txq; + + rcu_read_lock_bh(); skb = cfpkt_tonative(pkt); skb->dev = caifd->netdev; + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_CAIF); + caifdev = netdev_priv(caifd->netdev); + + /* Check if we need to handle xoff */ + if (likely(caifd->netdev->tx_queue_len == 0)) + goto noxoff; + + if (unlikely(caifd->xoff)) + goto noxoff; + + if (likely(!netif_queue_stopped(caifd->netdev))) { + /* If we run with a TX queue, check if the queue is too long*/ + txq = netdev_get_tx_queue(skb->dev, 0); + qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc)); + + if (likely(qlen == 0)) + goto noxoff; + + high = (caifd->netdev->tx_queue_len * q_high) / 100; + if (likely(qlen < high)) + goto noxoff; + } + + /* Hold lock while accessing xoff */ + spin_lock_bh(&caifd->flow_lock); + if (caifd->xoff) { + spin_unlock_bh(&caifd->flow_lock); + goto noxoff; + } + + /* + * Handle flow off, we do this by temporary hi-jacking this + * skb's destructor function, and replace it with our own + * flow-on callback. The callback will set flow-on and call + * the original destructor. + */ + + pr_debug("queue has stopped(%d) or is full (%d > %d)\n", + netif_queue_stopped(caifd->netdev), + qlen, high); + caifd->xoff = 1; + caifd->xoff_skb = skb; + caifd->xoff_skb_dtor = skb->destructor; + skb->destructor = caif_flow_cb; + spin_unlock_bh(&caifd->flow_lock); + + caifd->layer.up->ctrlcmd(caifd->layer.up, + _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, + caifd->layer.id); +noxoff: + rcu_read_unlock_bh(); err = dev_queue_xmit(skb); if (err > 0) @@ -172,7 +275,10 @@ static int receive(struct sk_buff *skb, struct net_device *dev, /* Release reference to stack upwards */ caifd_put(caifd); - return 0; + + if (err != 0) + err = NET_RX_DROP; + return err; } static struct packet_type caif_packet_type __read_mostly = { @@ -203,6 +309,57 @@ static void dev_flowctrl(struct net_device *dev, int on) caifd_put(caifd); } +void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, + struct cflayer *link_support, int head_room, + struct cflayer **layer, int (**rcv_func)( + struct sk_buff *, struct net_device *, + struct packet_type *, struct net_device *)) +{ + struct caif_device_entry *caifd; + enum cfcnfg_phy_preference pref; + struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); + struct caif_device_entry_list *caifdevs; + + caifdevs = caif_device_list(dev_net(dev)); + if (!cfg || !caifdevs) + return; + caifd = caif_device_alloc(dev); + if (!caifd) + return; + *layer = &caifd->layer; + spin_lock_init(&caifd->flow_lock); + + switch (caifdev->link_select) { + case CAIF_LINK_HIGH_BANDW: + pref = CFPHYPREF_HIGH_BW; + break; + case CAIF_LINK_LOW_LATENCY: + pref = CFPHYPREF_LOW_LAT; + break; + default: + pref = CFPHYPREF_HIGH_BW; + break; + } + mutex_lock(&caifdevs->lock); + list_add_rcu(&caifd->list, &caifdevs->list); + + strncpy(caifd->layer.name, dev->name, + sizeof(caifd->layer.name) - 1); + caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; + caifd->layer.transmit = transmit; + cfcnfg_add_phy_layer(cfg, + dev, + &caifd->layer, + pref, + link_support, + caifdev->use_fcs, + head_room); + mutex_unlock(&caifdevs->lock); + if (rcv_func) + *rcv_func = receive; +} +EXPORT_SYMBOL(caif_enroll_dev); + /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, void *arg) @@ -210,62 +367,40 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, struct net_device *dev = arg; struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; - enum cfcnfg_phy_preference pref; - enum cfcnfg_phy_type phy_type; struct cfcnfg *cfg; + struct cflayer *layer, *link_support; + int head_room = 0; struct caif_device_entry_list *caifdevs; - if (dev->type != ARPHRD_CAIF) - return 0; - cfg = get_cfcnfg(dev_net(dev)); - if (cfg == NULL) + caifdevs = caif_device_list(dev_net(dev)); + if (!cfg || !caifdevs) return 0; - caifdevs = caif_device_list(dev_net(dev)); + caifd = caif_get(dev); + if (caifd == NULL && dev->type != ARPHRD_CAIF) + return 0; switch (what) { case NETDEV_REGISTER: - caifd = caif_device_alloc(dev); - if (!caifd) - return 0; + if (caifd != NULL) + break; caifdev = netdev_priv(dev); - caifdev->flowctrl = dev_flowctrl; - - caifd->layer.transmit = transmit; - if (caifdev->use_frag) - phy_type = CFPHYTYPE_FRAG; - else - phy_type = CFPHYTYPE_CAIF; - - switch (caifdev->link_select) { - case CAIF_LINK_HIGH_BANDW: - pref = CFPHYPREF_HIGH_BW; - break; - case CAIF_LINK_LOW_LATENCY: - pref = CFPHYPREF_LOW_LAT; - break; - default: - pref = CFPHYPREF_HIGH_BW; - break; + link_support = NULL; + if (caifdev->use_frag) { + head_room = 1; + link_support = cfserl_create(dev->ifindex, + caifdev->use_stx); + if (!link_support) { + pr_warn("Out of memory\n"); + break; + } } - strncpy(caifd->layer.name, dev->name, - sizeof(caifd->layer.name) - 1); - caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; - - mutex_lock(&caifdevs->lock); - list_add_rcu(&caifd->list, &caifdevs->list); - - cfcnfg_add_phy_layer(cfg, - phy_type, - dev, - &caifd->layer, - pref, - caifdev->use_fcs, - caifdev->use_stx); - mutex_unlock(&caifdevs->lock); + caif_enroll_dev(dev, caifdev, link_support, head_room, + &layer, NULL); + caifdev->flowctrl = dev_flowctrl; break; case NETDEV_UP: @@ -277,6 +412,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; } + caifd->xoff = 0; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); @@ -298,6 +434,24 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_DOWN_IND, caifd->layer.id); + + spin_lock_bh(&caifd->flow_lock); + + /* + * Replace our xoff-destructor with original destructor. + * We trust that skb->destructor *always* is called before + * the skb reference is invalid. The hijacked SKB destructor + * takes the flow_lock so manipulating the skb->destructor here + * should be safe. + */ + if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) + caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; + + caifd->xoff = 0; + caifd->xoff_skb_dtor = NULL; + caifd->xoff_skb = NULL; + + spin_unlock_bh(&caifd->flow_lock); caifd_put(caifd); break; @@ -371,17 +525,14 @@ static void caif_exit_net(struct net *net) struct caif_device_entry *caifd, *tmp; struct caif_device_entry_list *caifdevs = caif_device_list(net); - struct cfcnfg *cfg; + struct cfcnfg *cfg = get_cfcnfg(net); + + if (!cfg || !caifdevs) + return; rtnl_lock(); mutex_lock(&caifdevs->lock); - cfg = get_cfcnfg(net); - if (cfg == NULL) { - mutex_unlock(&caifdevs->lock); - return; - } - list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { int i = 0; list_del_rcu(&caifd->list); diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c new file mode 100644 index 000000000000..f5db57c58081 --- /dev/null +++ b/net/caif/caif_usb.c @@ -0,0 +1,208 @@ +/* + * CAIF USB handler + * Copyright (C) ST-Ericsson AB 2011 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/mii.h> +#include <linux/usb.h> +#include <linux/usb/usbnet.h> +#include <net/netns/generic.h> +#include <net/caif/caif_dev.h> +#include <net/caif/caif_layer.h> +#include <net/caif/cfpkt.h> +#include <net/caif/cfcnfg.h> + +MODULE_LICENSE("GPL"); + +#define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */ +#define CFUSB_ALIGNMENT 4 /* Number of bytes to align. */ +#define CFUSB_MAX_HEADLEN (CFUSB_PAD_DESCR_SZ + CFUSB_ALIGNMENT-1) +#define STE_USB_VID 0x04cc /* USB Product ID for ST-Ericsson */ +#define STE_USB_PID_CAIF 0x2306 /* Product id for CAIF Modems */ + +struct cfusbl { + struct cflayer layer; + u8 tx_eth_hdr[ETH_HLEN]; +}; + +static bool pack_added; + +static int cfusbl_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 hpad; + + /* Remove padding. */ + cfpkt_extr_head(pkt, &hpad, 1); + cfpkt_extr_head(pkt, NULL, hpad); + return layr->up->receive(layr->up, pkt); +} + +static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + struct caif_payload_info *info; + u8 hpad; + u8 zeros[CFUSB_ALIGNMENT]; + struct sk_buff *skb; + struct cfusbl *usbl = container_of(layr, struct cfusbl, layer); + + skb = cfpkt_tonative(pkt); + + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IP); + + info = cfpkt_info(pkt); + hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1); + + if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) { + pr_warn("Headroom to small\n"); + kfree_skb(skb); + return -EIO; + } + memset(zeros, 0, hpad); + + cfpkt_add_head(pkt, zeros, hpad); + cfpkt_add_head(pkt, &hpad, 1); + cfpkt_add_head(pkt, usbl->tx_eth_hdr, sizeof(usbl->tx_eth_hdr)); + return layr->dn->transmit(layr->dn, pkt); +} + +static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid) +{ + if (layr->up && layr->up->ctrlcmd) + layr->up->ctrlcmd(layr->up, ctrl, layr->id); +} + +struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], + u8 braddr[ETH_ALEN]) +{ + struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); + + if (!this) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfusbl, layer) == 0); + + memset(this, 0, sizeof(struct cflayer)); + this->layer.receive = cfusbl_receive; + this->layer.transmit = cfusbl_transmit; + this->layer.ctrlcmd = cfusbl_ctrlcmd; + snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "usb%d", phyid); + this->layer.id = phyid; + + /* + * Construct TX ethernet header: + * 0-5 destination address + * 5-11 source address + * 12-13 protocol type + */ + memcpy(&this->tx_eth_hdr[ETH_ALEN], braddr, ETH_ALEN); + memcpy(&this->tx_eth_hdr[ETH_ALEN], ethaddr, ETH_ALEN); + this->tx_eth_hdr[12] = cpu_to_be16(ETH_P_802_EX1) & 0xff; + this->tx_eth_hdr[13] = (cpu_to_be16(ETH_P_802_EX1) >> 8) & 0xff; + pr_debug("caif ethernet TX-header dst:%pM src:%pM type:%02x%02x\n", + this->tx_eth_hdr, this->tx_eth_hdr + ETH_ALEN, + this->tx_eth_hdr[12], this->tx_eth_hdr[13]); + + return (struct cflayer *) this; +} + +static struct packet_type caif_usb_type __read_mostly = { + .type = cpu_to_be16(ETH_P_802_EX1), +}; + +static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, + void *arg) +{ + struct net_device *dev = arg; + struct caif_dev_common common; + struct cflayer *layer, *link_support; + struct usbnet *usbnet = netdev_priv(dev); + struct usb_device *usbdev = usbnet->udev; + struct ethtool_drvinfo drvinfo; + + /* + * Quirks: High-jack ethtool to find if we have a NCM device, + * and find it's VID/PID. + */ + if (dev->ethtool_ops == NULL || dev->ethtool_ops->get_drvinfo == NULL) + return 0; + + dev->ethtool_ops->get_drvinfo(dev, &drvinfo); + if (strncmp(drvinfo.driver, "cdc_ncm", 7) != 0) + return 0; + + pr_debug("USB CDC NCM device VID:0x%4x PID:0x%4x\n", + le16_to_cpu(usbdev->descriptor.idVendor), + le16_to_cpu(usbdev->descriptor.idProduct)); + + /* Check for VID/PID that supports CAIF */ + if (!(le16_to_cpu(usbdev->descriptor.idVendor) == STE_USB_VID && + le16_to_cpu(usbdev->descriptor.idProduct) == STE_USB_PID_CAIF)) + return 0; + + if (what == NETDEV_UNREGISTER) + module_put(THIS_MODULE); + + if (what != NETDEV_REGISTER) + return 0; + + __module_get(THIS_MODULE); + + memset(&common, 0, sizeof(common)); + common.use_frag = false; + common.use_fcs = false; + common.use_stx = false; + common.link_select = CAIF_LINK_HIGH_BANDW; + common.flowctrl = NULL; + + link_support = cfusbl_create(dev->ifindex, dev->dev_addr, + dev->broadcast); + + if (!link_support) + return -ENOMEM; + + if (dev->num_tx_queues > 1) + pr_warn("USB device uses more than one tx queue\n"); + + caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN, + &layer, &caif_usb_type.func); + if (!pack_added) + dev_add_pack(&caif_usb_type); + pack_added = 1; + + strncpy(layer->name, dev->name, + sizeof(layer->name) - 1); + layer->name[sizeof(layer->name) - 1] = 0; + + return 0; +} + +static struct notifier_block caif_device_notifier = { + .notifier_call = cfusbl_device_notify, + .priority = 0, +}; + +static int __init cfusbl_init(void) +{ + return register_netdevice_notifier(&caif_device_notifier); +} + +static void __exit cfusbl_exit(void) +{ + unregister_netdevice_notifier(&caif_device_notifier); + dev_remove_pack(&caif_usb_type); +} + +module_init(cfusbl_init); +module_exit(cfusbl_exit); diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 00523ecc4ced..598aafb4cb51 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -45,8 +45,8 @@ struct cfcnfg_phyinfo { /* Interface index */ int ifindex; - /* Use Start of frame extension */ - bool use_stx; + /* Protocol head room added for CAIF link layer */ + int head_room; /* Use Start of frame checksum */ bool use_fcs; @@ -187,11 +187,11 @@ int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); + cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); - cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); @@ -350,9 +350,7 @@ int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, *ifindex = phy->ifindex; *proto_tail = 2; - *proto_head = - - protohead[param.linktype] + (phy->use_stx ? 1 : 0); + *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); @@ -460,13 +458,13 @@ unlock: } void -cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, +cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, - bool fcs, bool stx) + struct cflayer *link_support, + bool fcs, int head_room) { struct cflayer *frml; - struct cflayer *phy_driver = NULL; struct cfcnfg_phyinfo *phyinfo = NULL; int i; u8 phyid; @@ -482,26 +480,13 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); - goto out_err; + goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) goto out_err; - switch (phy_type) { - case CFPHYTYPE_FRAG: - phy_driver = - cfserl_create(CFPHYTYPE_FRAG, phyid, stx); - if (!phy_driver) - goto out_err; - break; - case CFPHYTYPE_CAIF: - phy_driver = NULL; - break; - default: - goto out_err; - } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; @@ -509,7 +494,7 @@ got_phyid: phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; - phyinfo->use_stx = stx; + phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); @@ -519,23 +504,23 @@ got_phyid: phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); - if (phy_driver != NULL) { - phy_driver->id = phyid; - layer_set_dn(frml, phy_driver); - layer_set_up(phy_driver, frml); - layer_set_dn(phy_driver, phy_layer); - layer_set_up(phy_layer, phy_driver); + if (link_support != NULL) { + link_support->id = phyid; + layer_set_dn(frml, link_support); + layer_set_up(link_support, frml); + layer_set_dn(link_support, phy_layer); + layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); +out: mutex_unlock(&cnfg->lock); return; out_err: - kfree(phy_driver); kfree(phyinfo); mutex_unlock(&cnfg->lock); } diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c index f39921171d0d..d3ca87bf23b7 100644 --- a/net/caif/cffrml.c +++ b/net/caif/cffrml.c @@ -136,20 +136,21 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) { - int tmp; u16 chks; u16 len; + __le16 data; + struct cffrml *this = container_obj(layr); if (this->dofcs) { chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); - tmp = cpu_to_le16(chks); - cfpkt_add_trail(pkt, &tmp, 2); + data = cpu_to_le16(chks); + cfpkt_add_trail(pkt, &data, 2); } else { cfpkt_pad_trail(pkt, 2); } len = cfpkt_getlen(pkt); - tmp = cpu_to_le16(len); - cfpkt_add_head(pkt, &tmp, 2); + data = cpu_to_le16(len); + cfpkt_add_head(pkt, &data, 2); cfpkt_info(pkt)->hdr_len += 2; if (cfpkt_erroneous(pkt)) { pr_err("Packet is erroneous!\n"); diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index df08c47183d4..e335ba859b97 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -63,7 +63,6 @@ static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb) return (struct cfpkt *) skb; } - struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt) { struct cfpkt *pkt = skb_to_pkt(nativepkt); @@ -105,14 +104,12 @@ void cfpkt_destroy(struct cfpkt *pkt) kfree_skb(skb); } - inline bool cfpkt_more(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len > 0; } - int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); @@ -144,9 +141,11 @@ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) } from = skb_pull(skb, len); from -= len; - memcpy(data, from, len); + if (data) + memcpy(data, from, len); return 0; } +EXPORT_SYMBOL(cfpkt_extr_head); int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) { @@ -170,13 +169,11 @@ int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) return 0; } - int cfpkt_pad_trail(struct cfpkt *pkt, u16 len) { return cfpkt_add_body(pkt, NULL, len); } - int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); @@ -255,21 +252,19 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) memcpy(to, data, len); return 0; } - +EXPORT_SYMBOL(cfpkt_add_head); inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len) { return cfpkt_add_body(pkt, data, len); } - inline u16 cfpkt_getlen(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len; } - inline u16 cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16, void *, u16), u16 data) @@ -287,7 +282,6 @@ inline u16 cfpkt_iterate(struct cfpkt *pkt, return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); } - int cfpkt_setlen(struct cfpkt *pkt, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); @@ -399,3 +393,4 @@ struct caif_payload_info *cfpkt_info(struct cfpkt *pkt) { return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb; } +EXPORT_SYMBOL(cfpkt_info); diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c index 797c8d165993..8e68b97f13ee 100644 --- a/net/caif/cfserl.c +++ b/net/caif/cfserl.c @@ -31,7 +31,7 @@ static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); -struct cflayer *cfserl_create(int type, int instance, bool use_stx) +struct cflayer *cfserl_create(int instance, bool use_stx) { struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC); if (!this) @@ -40,7 +40,6 @@ struct cflayer *cfserl_create(int type, int instance, bool use_stx) this->layer.receive = cfserl_receive; this->layer.transmit = cfserl_transmit; this->layer.ctrlcmd = cfserl_ctrlcmd; - this->layer.type = type; this->usestx = use_stx; spin_lock_init(&this->sync); snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "ser1"); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 733e46008b89..f4f3f58f5234 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -244,7 +244,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ceph_pagelist_init(req->r_trail); } /* create request message; allow space for oid */ - msg_size += 40; + msg_size += MAX_OBJ_NAME_SIZE; if (snapc) msg_size += sizeof(u64) * snapc->num_snaps; if (use_mempool) diff --git a/net/core/Makefile b/net/core/Makefile index 0d357b1c4e57..c4ecc864020f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o net_namespace.o secure_seq.o + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o @@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o diff --git a/net/core/dev.c b/net/core/dev.c index 6ba50a1e404c..f494675471a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -133,10 +133,9 @@ #include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> -#include <linux/if_tunnel.h> -#include <linux/if_pppox.h> -#include <linux/ppp_defs.h> #include <linux/net_tstamp.h> +#include <linux/jump_label.h> +#include <net/flow_keys.h> #include "net-sysfs.h" @@ -1320,8 +1319,6 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - u32 flags; - /* * If we're trying to disable lro on a vlan device * use the underlying physical device instead @@ -1329,15 +1326,9 @@ void dev_disable_lro(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); - if (dev->ethtool_ops && dev->ethtool_ops->get_flags) - flags = dev->ethtool_ops->get_flags(dev); - else - flags = ethtool_op_get_flags(dev); + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); - if (!(flags & ETH_FLAG_LRO)) - return; - - __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); } @@ -1396,7 +1387,7 @@ rollback: for_each_net(net) { for_each_netdev(net, dev) { if (dev == last) - break; + goto outroll; if (dev->flags & IFF_UP) { nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); @@ -1407,6 +1398,7 @@ rollback: } } +outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1449,34 +1441,55 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); +static struct jump_label_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call jump_label_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * jump_label_dec() calls. + */ +static atomic_t netstamp_needed_deferred; +#endif void net_enable_timestamp(void) { - atomic_inc(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + + if (deferred) { + while (--deferred) + jump_label_dec(&netstamp_needed); + return; + } +#endif + WARN_ON(in_interrupt()); + jump_label_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { - atomic_dec(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL + if (in_interrupt()) { + atomic_inc(&netstamp_needed_deferred); + return; + } +#endif + jump_label_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { - if (atomic_read(&netstamp_needed)) + skb->tstamp.tv64 = 0; + if (static_branch(&netstamp_needed)) __net_timestamp(skb); - else - skb->tstamp.tv64 = 0; } -static inline void net_timestamp_check(struct sk_buff *skb) -{ - if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) - __net_timestamp(skb); -} +#define net_timestamp_check(COND, SKB) \ + if (static_branch(&netstamp_needed)) { \ + if ((COND) && !(SKB)->tstamp.tv64) \ + __net_timestamp(SKB); \ + } \ static int net_hwtstamp_validate(struct ifreq *ifr) { @@ -1923,7 +1936,8 @@ EXPORT_SYMBOL(skb_checksum_help); * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; @@ -1953,9 +1967,9 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) dev->ethtool_ops->get_drvinfo(dev, &info); - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", - info.driver, dev ? dev->features : 0L, - skb->sk ? skb->sk->sk_route_caps : 0L, + WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n", + info.driver, dev ? &dev->features : NULL, + skb->sk ? &skb->sk->sk_route_caps : NULL, skb->len, skb->data_len, skb->ip_summed); if (skb_header_cloned(skb) && @@ -2064,7 +2078,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) * This function segments the given skb and stores the list of segments * in skb->next. */ -static int dev_gso_segment(struct sk_buff *skb, int features) +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs; @@ -2103,7 +2117,7 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } -static bool can_checksum_protocol(unsigned long features, __be16 protocol) +static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { return ((features & NETIF_F_GEN_CSUM) || ((features & NETIF_F_V4_CSUM) && @@ -2114,7 +2128,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) protocol == htons(ETH_P_FCOE))); } -static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +static netdev_features_t harmonize_features(struct sk_buff *skb, + __be16 protocol, netdev_features_t features) { if (!can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; @@ -2126,10 +2141,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features return features; } -u32 netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - u32 features = skb->dev->features; + netdev_features_t features = skb->dev->features; if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2175,7 +2190,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int skb_len; if (likely(!skb->next)) { - u32 features; + netdev_features_t features; /* * If device doesn't need skb->dst, release it right now while @@ -2256,7 +2271,7 @@ gso: return rc; } txq_trans_update(txq); - if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) + if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -2456,6 +2471,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +static void skb_update_prio(struct sk_buff *skb) +{ + struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + + if ((!skb->priority) && (skb->sk) && map) + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; +} +#else +#define skb_update_prio(skb) +#endif + static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 @@ -2496,6 +2523,8 @@ int dev_queue_xmit(struct sk_buff *skb) */ rcu_read_lock_bh(); + skb_update_prio(skb); + txq = dev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); @@ -2530,7 +2559,7 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, cpu); - if (!netif_tx_queue_stopped(txq)) { + if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); __this_cpu_dec(xmit_recursion); @@ -2591,123 +2620,28 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ void __skb_get_rxhash(struct sk_buff *skb) { - int nhoff, hash = 0, poff; - const struct ipv6hdr *ip6; - const struct iphdr *ip; - const struct vlan_hdr *vlan; - u8 ip_proto; - u32 addr1, addr2; - u16 proto; - union { - u32 v32; - u16 v16[2]; - } ports; - - nhoff = skb_network_offset(skb); - proto = skb->protocol; - -again: - switch (proto) { - case __constant_htons(ETH_P_IP): -ip: - if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) - goto done; - - ip = (const struct iphdr *) (skb->data + nhoff); - if (ip_is_fragment(ip)) - ip_proto = 0; - else - ip_proto = ip->protocol; - addr1 = (__force u32) ip->saddr; - addr2 = (__force u32) ip->daddr; - nhoff += ip->ihl * 4; - break; - case __constant_htons(ETH_P_IPV6): -ipv6: - if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) - goto done; - - ip6 = (const struct ipv6hdr *) (skb->data + nhoff); - ip_proto = ip6->nexthdr; - addr1 = (__force u32) ip6->saddr.s6_addr32[3]; - addr2 = (__force u32) ip6->daddr.s6_addr32[3]; - nhoff += 40; - break; - case __constant_htons(ETH_P_8021Q): - if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff)) - goto done; - vlan = (const struct vlan_hdr *) (skb->data + nhoff); - proto = vlan->h_vlan_encapsulated_proto; - nhoff += sizeof(*vlan); - goto again; - case __constant_htons(ETH_P_PPP_SES): - if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff)) - goto done; - proto = *((__be16 *) (skb->data + nhoff + - sizeof(struct pppoe_hdr))); - nhoff += PPPOE_SES_HLEN; - switch (proto) { - case __constant_htons(PPP_IP): - goto ip; - case __constant_htons(PPP_IPV6): - goto ipv6; - default: - goto done; - } - default: - goto done; - } - - switch (ip_proto) { - case IPPROTO_GRE: - if (pskb_may_pull(skb, nhoff + 16)) { - u8 *h = skb->data + nhoff; - __be16 flags = *(__be16 *)h; + struct flow_keys keys; + u32 hash; - /* - * Only look inside GRE if version zero and no - * routing - */ - if (!(flags & (GRE_VERSION|GRE_ROUTING))) { - proto = *(__be16 *)(h + 2); - nhoff += 4; - if (flags & GRE_CSUM) - nhoff += 4; - if (flags & GRE_KEY) - nhoff += 4; - if (flags & GRE_SEQ) - nhoff += 4; - goto again; - } - } - break; - case IPPROTO_IPIP: - goto again; - default: - break; - } + if (!skb_flow_dissect(skb, &keys)) + return; - ports.v32 = 0; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - nhoff += poff; - if (pskb_may_pull(skb, nhoff + 4)) { - ports.v32 = * (__force u32 *) (skb->data + nhoff); - if (ports.v16[1] < ports.v16[0]) - swap(ports.v16[0], ports.v16[1]); - skb->l4_rxhash = 1; - } + if (keys.ports) { + if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) + swap(keys.port16[0], keys.port16[1]); + skb->l4_rxhash = 1; } /* get a consistent hash (same value on both flow directions) */ - if (addr2 < addr1) - swap(addr1, addr2); + if ((__force u32)keys.dst < (__force u32)keys.src) + swap(keys.dst, keys.src); - hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, hashrnd); if (!hash) hash = 1; -done: skb->rxhash = hash; } EXPORT_SYMBOL(__skb_get_rxhash); @@ -2718,6 +2652,8 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +struct jump_label_key rps_needed __read_mostly; + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -2997,12 +2933,11 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); #ifdef CONFIG_RPS - { + if (static_branch(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3017,14 +2952,13 @@ int netif_rx(struct sk_buff *skb) rcu_read_unlock(); preempt_enable(); - } -#else + } else +#endif { unsigned int qtail; ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } -#endif return ret; } EXPORT_SYMBOL(netif_rx); @@ -3230,8 +3164,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; - if (!netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); @@ -3362,14 +3295,13 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { - if (netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(netdev_tstamp_prequeue, skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - { + if (static_branch(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; @@ -3380,16 +3312,12 @@ int netif_receive_skb(struct sk_buff *skb) if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - } else { - rcu_read_unlock(); - ret = __netif_receive_skb(skb); + return ret; } - - return ret; + rcu_read_unlock(); } -#else - return __netif_receive_skb(skb); #endif + return __netif_receive_skb(skb); } EXPORT_SYMBOL(netif_receive_skb); @@ -4282,6 +4210,12 @@ static int dev_seq_open(struct inode *inode, struct file *file) sizeof(struct dev_iter_state)); } +int dev_seq_open_ops(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state)); +} + static const struct file_operations dev_seq_fops = { .owner = THIS_MODULE, .open = dev_seq_open, @@ -4532,7 +4466,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; uid_t uid; gid_t gid; @@ -4589,7 +4523,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) */ int dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc); @@ -4616,7 +4550,7 @@ EXPORT_SYMBOL(dev_set_promiscuity); int dev_set_allmulti(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; ASSERT_RTNL(); @@ -4719,7 +4653,7 @@ EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags) { - int old_flags = dev->flags; + unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); @@ -4802,10 +4736,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int dev_change_flags(struct net_device *dev, unsigned int flags) { - int ret, changes; - int old_flags = dev->flags; + int ret; + unsigned int changes, old_flags = dev->flags; ret = __dev_change_flags(dev, flags); if (ret < 0) @@ -5362,7 +5296,8 @@ static void rollback_registered(struct net_device *dev) list_del(&single); } -static u32 netdev_fix_features(struct net_device *dev, u32 features) +static netdev_features_t netdev_fix_features(struct net_device *dev, + netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && @@ -5371,12 +5306,6 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features) features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - if ((features & NETIF_F_NO_CSUM) && - (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_warn(dev, "mixed no checksumming and other settings.\n"); - features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); - } - /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { @@ -5424,7 +5353,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features) int __netdev_update_features(struct net_device *dev) { - u32 features; + netdev_features_t features; int err = 0; ASSERT_RTNL(); @@ -5440,16 +5369,16 @@ int __netdev_update_features(struct net_device *dev) if (dev->features == features) return 0; - netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", - dev->features, features); + netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", + &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); if (unlikely(err < 0)) { netdev_err(dev, - "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", - err, features, dev->features); + "set_features() failed (%d); wanted %pNF, left %pNF\n", + err, &features, &dev->features); return -1; } @@ -5548,6 +5477,9 @@ static void netdev_init_one_queue(struct net_device *dev, queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; +#ifdef CONFIG_BQL + dql_init(&queue->dql, HZ); +#endif } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5633,11 +5565,12 @@ int register_netdevice(struct net_device *dev) dev->wanted_features = dev->features & dev->hw_features; /* Turn on no cache copy if HW is doing checksum */ - dev->hw_features |= NETIF_F_NOCACHE_COPY; - if ((dev->features & NETIF_F_ALL_CSUM) && - !(dev->features & NETIF_F_NO_CSUM)) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; + if (!(dev->flags & IFF_LOOPBACK)) { + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if (dev->features & NETIF_F_ALL_CSUM) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } } /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. @@ -6373,7 +6306,8 @@ static int dev_cpu_callback(struct notifier_block *nfb, * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ -u32 netdev_increment_features(u32 all, u32 one, u32 mask) +netdev_features_t netdev_increment_features(netdev_features_t all, + netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_GEN_CSUM) mask |= NETIF_F_ALL_CSUM; @@ -6382,10 +6316,6 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask) all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; - /* If device needs checksumming, downgrade to it. */ - if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) - all &= ~NETIF_F_NO_CSUM; - /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_GEN_CSUM) all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 277faef9148d..febba516db62 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -696,8 +696,7 @@ static const struct seq_operations dev_mc_seq_ops = { static int dev_mc_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); + return dev_seq_open_ops(inode, file, &dev_mc_seq_ops); } static const struct file_operations dev_mc_seq_fops = { diff --git a/net/core/dst.c b/net/core/dst.c index d5e2c4c09107..43d94cedbf7c 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -366,7 +366,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, dev_hold(dst->dev); dev_put(dev); rcu_read_lock(); - neigh = dst_get_neighbour(dst); + neigh = dst_get_neighbour_noref(dst); if (neigh && neigh->dev == dev) { neigh->dev = dst->dev; dev_hold(dst->dev); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f44481707124..31b0b7f5383e 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -36,235 +36,44 @@ u32 ethtool_op_get_link(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_link); -u32 ethtool_op_get_tx_csum(struct net_device *dev) -{ - return (dev->features & NETIF_F_ALL_CSUM) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tx_csum); - -int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_IP_CSUM; - else - dev->features &= ~NETIF_F_IP_CSUM; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_csum); - -int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_HW_CSUM; - else - dev->features &= ~NETIF_F_HW_CSUM; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); - -int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; - else - dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); - -u32 ethtool_op_get_sg(struct net_device *dev) -{ - return (dev->features & NETIF_F_SG) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_sg); - -int ethtool_op_set_sg(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_SG; - else - dev->features &= ~NETIF_F_SG; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_sg); - -u32 ethtool_op_get_tso(struct net_device *dev) -{ - return (dev->features & NETIF_F_TSO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tso); - -int ethtool_op_set_tso(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_TSO; - else - dev->features &= ~NETIF_F_TSO; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tso); - -u32 ethtool_op_get_ufo(struct net_device *dev) -{ - return (dev->features & NETIF_F_UFO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_ufo); - -int ethtool_op_set_ufo(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_UFO; - else - dev->features &= ~NETIF_F_UFO; - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_ufo); - -/* the following list of flags are the same as their associated - * NETIF_F_xxx values in include/linux/netdevice.h - */ -static const u32 flags_dup_features = - (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | - ETH_FLAG_RXHASH); - -u32 ethtool_op_get_flags(struct net_device *dev) -{ - /* in the future, this function will probably contain additional - * handling for flags which are not so easily handled - * by a simple masking operation - */ - - return dev->features & flags_dup_features; -} -EXPORT_SYMBOL(ethtool_op_get_flags); - -/* Check if device can enable (or disable) particular feature coded in "data" - * argument. Flags "supported" describe features that can be toggled by device. - * If feature can not be toggled, it state (enabled or disabled) must match - * hardcoded device features state, otherwise flags are marked as invalid. - */ -bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) -{ - u32 features = dev->features & flags_dup_features; - /* "data" can contain only flags_dup_features bits, - * see __ethtool_set_flags */ - - return (features & ~supported) != (data & ~supported); -} -EXPORT_SYMBOL(ethtool_invalid_flags); - -int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) -{ - if (ethtool_invalid_flags(dev, data, supported)) - return -EINVAL; - - dev->features = ((dev->features & ~flags_dup_features) | - (data & flags_dup_features)); - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_flags); - /* Handlers for each ethtool command */ -#define ETHTOOL_DEV_FEATURE_WORDS 1 - -static void ethtool_get_features_compat(struct net_device *dev, - struct ethtool_get_features_block *features) -{ - if (!dev->ethtool_ops) - return; - - /* getting RX checksum */ - if (dev->ethtool_ops->get_rx_csum) - if (dev->ethtool_ops->get_rx_csum(dev)) - features[0].active |= NETIF_F_RXCSUM; - - /* mark legacy-changeable features */ - if (dev->ethtool_ops->set_sg) - features[0].available |= NETIF_F_SG; - if (dev->ethtool_ops->set_tx_csum) - features[0].available |= NETIF_F_ALL_CSUM; - if (dev->ethtool_ops->set_tso) - features[0].available |= NETIF_F_ALL_TSO; - if (dev->ethtool_ops->set_rx_csum) - features[0].available |= NETIF_F_RXCSUM; - if (dev->ethtool_ops->set_flags) - features[0].available |= flags_dup_features; -} - -static int ethtool_set_feature_compat(struct net_device *dev, - int (*legacy_set)(struct net_device *, u32), - struct ethtool_set_features_block *features, u32 mask) -{ - u32 do_set; - - if (!legacy_set) - return 0; - - if (!(features[0].valid & mask)) - return 0; - - features[0].valid &= ~mask; - - do_set = !!(features[0].requested & mask); - - if (legacy_set(dev, do_set) < 0) - netdev_info(dev, - "Legacy feature change (%s) failed for 0x%08x\n", - do_set ? "set" : "clear", mask); - - return 1; -} - -static int ethtool_set_flags_compat(struct net_device *dev, - int (*legacy_set)(struct net_device *, u32), - struct ethtool_set_features_block *features, u32 mask) -{ - u32 value; - - if (!legacy_set) - return 0; - - if (!(features[0].valid & mask)) - return 0; - - value = dev->features & ~features[0].valid; - value |= features[0].requested; - - features[0].valid &= ~mask; - - if (legacy_set(dev, value & mask) < 0) - netdev_info(dev, "Legacy flags change failed\n"); - - return 1; -} - -static int ethtool_set_features_compat(struct net_device *dev, - struct ethtool_set_features_block *features) -{ - int compat; - - if (!dev->ethtool_ops) - return 0; - - compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg, - features, NETIF_F_SG); - compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum, - features, NETIF_F_ALL_CSUM); - compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso, - features, NETIF_F_ALL_TSO); - compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum, - features, NETIF_F_RXCSUM); - compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags, - features, flags_dup_features); - - return compat; -} +#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) + +static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", +}; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { @@ -272,18 +81,21 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) .cmd = ETHTOOL_GFEATURES, .size = ETHTOOL_DEV_FEATURE_WORDS, }; - struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { - { - .available = dev->hw_features, - .requested = dev->wanted_features, - .active = dev->features, - .never_changed = NETIF_F_NEVER_CHANGE, - }, - }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; u32 __user *sizeaddr; u32 copy_size; + int i; - ethtool_get_features_compat(dev, features); + /* in case feature bits run out again */ + BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); + + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + features[i].available = (u32)(dev->hw_features >> (32 * i)); + features[i].requested = (u32)(dev->wanted_features >> (32 * i)); + features[i].active = (u32)(dev->features >> (32 * i)); + features[i].never_changed = + (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); + } sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); if (get_user(copy_size, sizeaddr)) @@ -305,7 +117,8 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) { struct ethtool_sfeatures cmd; struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; - int ret = 0; + netdev_features_t wanted = 0, valid = 0; + int i, ret = 0; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; @@ -317,65 +130,29 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) if (copy_from_user(features, useraddr, sizeof(features))) return -EFAULT; - if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) - return -EINVAL; + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + valid |= (netdev_features_t)features[i].valid << (32 * i); + wanted |= (netdev_features_t)features[i].requested << (32 * i); + } - if (ethtool_set_features_compat(dev, features)) - ret |= ETHTOOL_F_COMPAT; + if (valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; - if (features[0].valid & ~dev->hw_features) { - features[0].valid &= dev->hw_features; + if (valid & ~dev->hw_features) { + valid &= dev->hw_features; ret |= ETHTOOL_F_UNSUPPORTED; } - dev->wanted_features &= ~features[0].valid; - dev->wanted_features |= features[0].valid & features[0].requested; + dev->wanted_features &= ~valid; + dev->wanted_features |= wanted & valid; __netdev_update_features(dev); - if ((dev->wanted_features ^ dev->features) & features[0].valid) + if ((dev->wanted_features ^ dev->features) & valid) ret |= ETHTOOL_F_WISH; return ret; } -static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = { - /* NETIF_F_SG */ "tx-scatter-gather", - /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4", - /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded", - /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic", - /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6", - /* NETIF_F_HIGHDMA */ "highdma", - /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist", - /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert", - - /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse", - /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter", - /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged", - /* NETIF_F_GSO */ "tx-generic-segmentation", - /* NETIF_F_LLTX */ "tx-lockless", - /* NETIF_F_NETNS_LOCAL */ "netns-local", - /* NETIF_F_GRO */ "rx-gro", - /* NETIF_F_LRO */ "rx-lro", - - /* NETIF_F_TSO */ "tx-tcp-segmentation", - /* NETIF_F_UFO */ "tx-udp-fragmentation", - /* NETIF_F_GSO_ROBUST */ "tx-gso-robust", - /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation", - /* NETIF_F_TSO6 */ "tx-tcp6-segmentation", - /* NETIF_F_FSO */ "tx-fcoe-segmentation", - "", - "", - - /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc", - /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp", - /* NETIF_F_FCOE_MTU */ "fcoe-mtu", - /* NETIF_F_NTUPLE */ "rx-ntuple-filter", - /* NETIF_F_RXHASH */ "rx-hashing", - /* NETIF_F_RXCSUM */ "rx-checksum", - /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy", - /* NETIF_F_LOOPBACK */ "loopback", -}; - static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -402,7 +179,7 @@ static void __ethtool_get_strings(struct net_device *dev, ops->get_strings(dev, stringset, data); } -static u32 ethtool_get_feature_mask(u32 eth_cmd) +static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) { /* feature masks of legacy discrete ethtool ops */ @@ -433,136 +210,82 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd) } } -static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops) - return NULL; - - switch (ethcmd) { - case ETHTOOL_GTXCSUM: - return ops->get_tx_csum; - case ETHTOOL_GRXCSUM: - return ops->get_rx_csum; - case ETHTOOL_SSG: - return ops->get_sg; - case ETHTOOL_STSO: - return ops->get_tso; - case ETHTOOL_SUFO: - return ops->get_ufo; - default: - return NULL; - } -} - -static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev) -{ - return !!(dev->features & NETIF_F_ALL_CSUM); -} - static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { - u32 mask = ethtool_get_feature_mask(ethcmd); + netdev_features_t mask = ethtool_get_feature_mask(ethcmd); struct ethtool_value edata = { .cmd = ethcmd, .data = !!(dev->features & mask), }; - /* compatibility with discrete get_ ops */ - if (!(dev->hw_features & mask)) { - u32 (*actor)(struct net_device *); - - actor = __ethtool_get_one_feature_actor(dev, ethcmd); - - /* bug compatibility with old get_rx_csum */ - if (ethcmd == ETHTOOL_GRXCSUM && !actor) - actor = __ethtool_get_rx_csum_oldbug; - - if (actor) - edata.data = actor(dev); - } - if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } -static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); -static int __ethtool_set_rx_csum(struct net_device *dev, u32 data); -static int __ethtool_set_sg(struct net_device *dev, u32 data); -static int __ethtool_set_tso(struct net_device *dev, u32 data); -static int __ethtool_set_ufo(struct net_device *dev, u32 data); - static int ethtool_set_one_feature(struct net_device *dev, void __user *useraddr, u32 ethcmd) { struct ethtool_value edata; - u32 mask; + netdev_features_t mask; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; mask = ethtool_get_feature_mask(ethcmd); mask &= dev->hw_features; - if (mask) { - if (edata.data) - dev->wanted_features |= mask; - else - dev->wanted_features &= ~mask; + if (!mask) + return -EOPNOTSUPP; - __netdev_update_features(dev); - return 0; - } + if (edata.data) + dev->wanted_features |= mask; + else + dev->wanted_features &= ~mask; - /* Driver is not converted to ndo_fix_features or does not - * support changing this offload. In the latter case it won't - * have corresponding ethtool_ops field set. - * - * Following part is to be removed after all drivers advertise - * their changeable features in netdev->hw_features and stop - * using discrete offload setting ops. - */ + __netdev_update_features(dev); - switch (ethcmd) { - case ETHTOOL_STXCSUM: - return __ethtool_set_tx_csum(dev, edata.data); - case ETHTOOL_SRXCSUM: - return __ethtool_set_rx_csum(dev, edata.data); - case ETHTOOL_SSG: - return __ethtool_set_sg(dev, edata.data); - case ETHTOOL_STSO: - return __ethtool_set_tso(dev, edata.data); - case ETHTOOL_SUFO: - return __ethtool_set_ufo(dev, edata.data); - default: - return -EOPNOTSUPP; - } + return 0; +} + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \ + NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH) + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; + + return flags; } -int __ethtool_set_flags(struct net_device *dev, u32 data) +static int __ethtool_set_flags(struct net_device *dev, u32 data) { - u32 changed; + netdev_features_t features = 0, changed; - if (data & ~flags_dup_features) + if (data & ~ETH_ALL_FLAGS) return -EINVAL; - /* legacy set_flags() op */ - if (dev->ethtool_ops->set_flags) { - if (unlikely(dev->hw_features & flags_dup_features)) - netdev_warn(dev, - "driver BUG: mixed hw_features and set_flags()\n"); - return dev->ethtool_ops->set_flags(dev, data); - } + if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX; + if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX; + if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ - changed = (data ^ dev->features) & flags_dup_features; + changed = (features ^ dev->features) & ETH_ALL_FEATURES; if (changed & ~dev->hw_features) return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; dev->wanted_features = - (dev->wanted_features & ~changed) | (data & dev->hw_features); + (dev->wanted_features & ~changed) | (features & changed); __netdev_update_features(dev); @@ -1231,81 +954,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); } -static int __ethtool_set_sg(struct net_device *dev, u32 data) -{ - int err; - - if (!dev->ethtool_ops->set_sg) - return -EOPNOTSUPP; - - if (data && !(dev->features & NETIF_F_ALL_CSUM)) - return -EINVAL; - - if (!data && dev->ethtool_ops->set_tso) { - err = dev->ethtool_ops->set_tso(dev, 0); - if (err) - return err; - } - - if (!data && dev->ethtool_ops->set_ufo) { - err = dev->ethtool_ops->set_ufo(dev, 0); - if (err) - return err; - } - return dev->ethtool_ops->set_sg(dev, data); -} - -static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) -{ - int err; - - if (!dev->ethtool_ops->set_tx_csum) - return -EOPNOTSUPP; - - if (!data && dev->ethtool_ops->set_sg) { - err = __ethtool_set_sg(dev, 0); - if (err) - return err; - } - - return dev->ethtool_ops->set_tx_csum(dev, data); -} - -static int __ethtool_set_rx_csum(struct net_device *dev, u32 data) -{ - if (!dev->ethtool_ops->set_rx_csum) - return -EOPNOTSUPP; - - if (!data) - dev->features &= ~NETIF_F_GRO; - - return dev->ethtool_ops->set_rx_csum(dev, data); -} - -static int __ethtool_set_tso(struct net_device *dev, u32 data) -{ - if (!dev->ethtool_ops->set_tso) - return -EOPNOTSUPP; - - if (data && !(dev->features & NETIF_F_SG)) - return -EINVAL; - - return dev->ethtool_ops->set_tso(dev, data); -} - -static int __ethtool_set_ufo(struct net_device *dev, u32 data) -{ - if (!dev->ethtool_ops->set_ufo) - return -EOPNOTSUPP; - if (data && !(dev->features & NETIF_F_SG)) - return -EINVAL; - if (data && !((dev->features & NETIF_F_GEN_CSUM) || - (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) - return -EINVAL; - return dev->ethtool_ops->set_ufo(dev, data); -} - static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; @@ -1771,9 +1419,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_flags ? - dev->ethtool_ops->get_flags : - ethtool_op_get_flags)); + __ethtool_get_flags); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c new file mode 100644 index 000000000000..0985b9b14b80 --- /dev/null +++ b/net/core/flow_dissector.c @@ -0,0 +1,143 @@ +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <linux/if_tunnel.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <net/flow_keys.h> + +/* copy saddr & daddr, possibly using 64bit load/store + * Equivalent to : flow->src = iph->saddr; + * flow->dst = iph->daddr; + */ +static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(*flow), dst) != + offsetof(typeof(*flow), src) + sizeof(flow->src)); + memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); +} + +bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ + int poff, nhoff = skb_network_offset(skb); + u8 ip_proto; + __be16 proto = skb->protocol; + + memset(flow, 0, sizeof(*flow)); + +again: + switch (proto) { + case __constant_htons(ETH_P_IP): { + const struct iphdr *iph; + struct iphdr _iph; +ip: + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ip_is_fragment(iph)) + ip_proto = 0; + else + ip_proto = iph->protocol; + iph_to_flow_copy_addrs(flow, iph); + nhoff += iph->ihl * 4; + break; + } + case __constant_htons(ETH_P_IPV6): { + const struct ipv6hdr *iph; + struct ipv6hdr _iph; +ipv6: + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return false; + + ip_proto = iph->nexthdr; + flow->src = iph->saddr.s6_addr32[3]; + flow->dst = iph->daddr.s6_addr32[3]; + nhoff += sizeof(struct ipv6hdr); + break; + } + case __constant_htons(ETH_P_8021Q): { + const struct vlan_hdr *vlan; + struct vlan_hdr _vlan; + + vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + if (!vlan) + return false; + + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + goto again; + } + case __constant_htons(ETH_P_PPP_SES): { + struct { + struct pppoe_hdr hdr; + __be16 proto; + } *hdr, _hdr; + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + if (!hdr) + return false; + proto = hdr->proto; + nhoff += PPPOE_SES_HLEN; + switch (proto) { + case __constant_htons(PPP_IP): + goto ip; + case __constant_htons(PPP_IPV6): + goto ipv6; + default: + return false; + } + } + default: + return false; + } + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + } *hdr, _hdr; + + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + if (!hdr) + return false; + /* + * Only look inside GRE if version zero and no + * routing + */ + if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) + nhoff += 4; + if (hdr->flags & GRE_KEY) + nhoff += 4; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + goto again; + } + break; + } + case IPPROTO_IPIP: + goto again; + default: + break; + } + + flow->ip_proto = ip_proto; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + __be32 *ports, _ports; + + nhoff += poff; + ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); + if (ports) + flow->ports = *ports; + } + + return true; +} +EXPORT_SYMBOL(skb_flow_dissect); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 039d51e6c284..4af151e1bf5d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) it to safe state. */ skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; @@ -272,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -287,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) goto out_entries; } - n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); + if (tbl->entry_size) + n = kzalloc(tbl->entry_size, GFP_ATOMIC); + else { + int sz = sizeof(*n) + tbl->key_len; + + sz = ALIGN(sz, NEIGH_PRIV_ALIGN); + sz += dev->neigh_priv_len; + n = kzalloc(sz, GFP_ATOMIC); + } if (!n) goto out_entries; @@ -462,7 +471,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -480,6 +489,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_neigh_release; } + if (dev->netdev_ops->ndo_neigh_construct) { + error = dev->netdev_ops->ndo_neigh_construct(n); + if (error < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + } + /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { @@ -677,18 +694,14 @@ static inline void neigh_parms_put(struct neigh_parms *parms) neigh_parms_destroy(parms); } -static void neigh_destroy_rcu(struct rcu_head *head) -{ - struct neighbour *neigh = container_of(head, struct neighbour, rcu); - - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); -} /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { + struct net_device *dev = neigh->dev; + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { @@ -702,14 +715,18 @@ void neigh_destroy(struct neighbour *neigh) printk(KERN_WARNING "Impossible event.\n"); skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; + + if (dev->netdev_ops->ndo_neigh_destroy) + dev->netdev_ops->ndo_neigh_destroy(neigh); - dev_put(neigh->dev); + dev_put(dev); neigh_parms_put(neigh->parms); NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); atomic_dec(&neigh->tbl->entries); - call_rcu(&neigh->rcu, neigh_destroy_rcu); + kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); @@ -842,6 +859,7 @@ static void neigh_invalidate(struct neighbour *neigh) write_lock(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) @@ -980,15 +998,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { - if (skb_queue_len(&neigh->arp_queue) >= - neigh->parms->queue_len) { + while (neigh->arp_queue_len_bytes + skb->truesize > + neigh->parms->queue_len_bytes) { struct sk_buff *buff; + buff = __skb_dequeue(&neigh->arp_queue); + if (!buff) + break; + neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); + neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } @@ -1167,7 +1190,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, rcu_read_lock(); /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (dst && (n2 = dst_get_neighbour(dst)) != NULL) + if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL) n1 = n2; n1->output(n1, skb); rcu_read_unlock(); @@ -1175,6 +1198,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_lock_bh(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) { @@ -1477,11 +1501,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); - if (!tbl->kmem_cachep) - tbl->kmem_cachep = - kmem_cache_create(tbl->id, tbl->entry_size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); @@ -1566,9 +1585,6 @@ int neigh_table_clear(struct neigh_table *tbl) free_percpu(tbl->stats); tbl->stats = NULL; - kmem_cache_destroy(tbl->kmem_cachep); - tbl->kmem_cachep = NULL; - return 0; } EXPORT_SYMBOL(neigh_table_clear); @@ -1747,7 +1763,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); - NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); + NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes); + /* approximative value for deprecated QUEUE_LEN (in packets) */ + NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, + DIV_ROUND_UP(parms->queue_len_bytes, + SKB_TRUESIZE(ETH_FRAME_LEN))); NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); @@ -1974,7 +1994,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) switch (i) { case NDTPA_QUEUE_LEN: - p->queue_len = nla_get_u32(tbp[i]); + p->queue_len_bytes = nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN); + break; + case NDTPA_QUEUE_LENBYTES: + p->queue_len_bytes = nla_get_u32(tbp[i]); break; case NDTPA_PROXY_QLEN: p->proxy_qlen = nla_get_u32(tbp[i]); @@ -2397,7 +2421,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; - pn = pn->next; + do { + pn = pn->next; + } while (pn && !net_eq(pneigh_net(pn), net)); + while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; @@ -2635,117 +2662,158 @@ EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL -#define NEIGH_VARS_MAX 19 +static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int size, ret; + ctl_table tmp = *ctl; + + tmp.data = &size; + size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write && !ret) + *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); + return ret; +} + +enum { + NEIGH_VAR_MCAST_PROBE, + NEIGH_VAR_UCAST_PROBE, + NEIGH_VAR_APP_PROBE, + NEIGH_VAR_RETRANS_TIME, + NEIGH_VAR_BASE_REACHABLE_TIME, + NEIGH_VAR_DELAY_PROBE_TIME, + NEIGH_VAR_GC_STALETIME, + NEIGH_VAR_QUEUE_LEN, + NEIGH_VAR_QUEUE_LEN_BYTES, + NEIGH_VAR_PROXY_QLEN, + NEIGH_VAR_ANYCAST_DELAY, + NEIGH_VAR_PROXY_DELAY, + NEIGH_VAR_LOCKTIME, + NEIGH_VAR_RETRANS_TIME_MS, + NEIGH_VAR_BASE_REACHABLE_TIME_MS, + NEIGH_VAR_GC_INTERVAL, + NEIGH_VAR_GC_THRESH1, + NEIGH_VAR_GC_THRESH2, + NEIGH_VAR_GC_THRESH3, + NEIGH_VAR_MAX +}; static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VARS_MAX]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; char *dev_name; } neigh_sysctl_template __read_mostly = { .neigh_vars = { - { + [NEIGH_VAR_MCAST_PROBE] = { .procname = "mcast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_UCAST_PROBE] = { .procname = "ucast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_APP_PROBE] = { .procname = "app_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_RETRANS_TIME] = { .procname = "retrans_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_BASE_REACHABLE_TIME] = { .procname = "base_reachable_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_DELAY_PROBE_TIME] = { .procname = "delay_first_probe_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_GC_STALETIME] = { .procname = "gc_stale_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_QUEUE_LEN] = { .procname = "unres_qlen", .maxlen = sizeof(int), .mode = 0644, + .proc_handler = proc_unres_qlen, + }, + [NEIGH_VAR_QUEUE_LEN_BYTES] = { + .procname = "unres_qlen_bytes", + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_PROXY_QLEN] = { .procname = "proxy_qlen", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_ANYCAST_DELAY] = { .procname = "anycast_delay", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_PROXY_DELAY] = { .procname = "proxy_delay", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_LOCKTIME] = { .procname = "locktime", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_RETRANS_TIME_MS] = { .procname = "retrans_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - { + [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { .procname = "base_reachable_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - { + [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, @@ -2778,47 +2846,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (!t) goto err; - t->neigh_vars[0].data = &p->mcast_probes; - t->neigh_vars[1].data = &p->ucast_probes; - t->neigh_vars[2].data = &p->app_probes; - t->neigh_vars[3].data = &p->retrans_time; - t->neigh_vars[4].data = &p->base_reachable_time; - t->neigh_vars[5].data = &p->delay_probe_time; - t->neigh_vars[6].data = &p->gc_staletime; - t->neigh_vars[7].data = &p->queue_len; - t->neigh_vars[8].data = &p->proxy_qlen; - t->neigh_vars[9].data = &p->anycast_delay; - t->neigh_vars[10].data = &p->proxy_delay; - t->neigh_vars[11].data = &p->locktime; - t->neigh_vars[12].data = &p->retrans_time; - t->neigh_vars[13].data = &p->base_reachable_time; + t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes; + t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes; + t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time; + t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time; + t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime; + t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes; + t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes; + t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen; + t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay; + t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; + t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time; if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); + memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, + sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; - t->neigh_vars[14].data = (int *)(p + 1); - t->neigh_vars[15].data = (int *)(p + 1) + 1; - t->neigh_vars[16].data = (int *)(p + 1) + 2; - t->neigh_vars[17].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; } if (handler) { /* RetransTime */ - t->neigh_vars[3].proc_handler = handler; - t->neigh_vars[3].extra1 = dev; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev; /* ReachableTime */ - t->neigh_vars[4].proc_handler = handler; - t->neigh_vars[4].extra1 = dev; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev; /* RetransTime (in milliseconds)*/ - t->neigh_vars[12].proc_handler = handler; - t->neigh_vars[12].extra1 = dev; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev; /* ReachableTime (in milliseconds) */ - t->neigh_vars[13].proc_handler = handler; - t->neigh_vars[13].extra1 = dev; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; } t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c71c434a4c05..9d134636f87c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -21,6 +21,7 @@ #include <linux/wireless.h> #include <linux/vmalloc.h> #include <linux/export.h> +#include <linux/jiffies.h> #include <net/wext.h> #include "net-sysfs.h" @@ -606,9 +607,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, rcu_assign_pointer(queue->rps_map, map); spin_unlock(&rps_map_lock); - if (old_map) + if (map) + jump_label_inc(&rps_needed); + if (old_map) { kfree_rcu(old_map, rcu); - + jump_label_dec(&rps_needed); + } free_cpumask_var(mask); return len; } @@ -780,7 +784,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif } -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS /* * netdev_queue sysfs structures and functions. */ @@ -826,6 +830,133 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; +static ssize_t show_trans_timeout(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + unsigned long trans_timeout; + + spin_lock_irq(&queue->_xmit_lock); + trans_timeout = queue->trans_timeout; + spin_unlock_irq(&queue->_xmit_lock); + + return sprintf(buf, "%lu", trans_timeout); +} + +static struct netdev_queue_attribute queue_trans_timeout = + __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); + +#ifdef CONFIG_BQL +/* + * Byte queue limits sysfs structures and functions. + */ +static ssize_t bql_show(char *buf, unsigned int value) +{ + return sprintf(buf, "%u\n", value); +} + +static ssize_t bql_set(const char *buf, const size_t count, + unsigned int *pvalue) +{ + unsigned int value; + int err; + + if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) + value = DQL_MAX_LIMIT; + else { + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + if (value > DQL_MAX_LIMIT) + return -EINVAL; + } + + *pvalue = value; + + return count; +} + +static ssize_t bql_show_hold_time(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); +} + +static ssize_t bql_set_hold_time(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct dql *dql = &queue->dql; + unsigned value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + dql->slack_hold_time = msecs_to_jiffies(value); + + return len; +} + +static struct netdev_queue_attribute bql_hold_time_attribute = + __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, + bql_set_hold_time); + +static ssize_t bql_show_inflight(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute = + __ATTR(inflight, S_IRUGO | S_IWUSR, bql_show_inflight, NULL); + +#define BQL_ATTR(NAME, FIELD) \ +static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ + struct netdev_queue_attribute *attr, \ + char *buf) \ +{ \ + return bql_show(buf, queue->dql.FIELD); \ +} \ + \ +static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ + struct netdev_queue_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return bql_set(buf, len, &queue->dql.FIELD); \ +} \ + \ +static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \ + __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \ + bql_set_ ## NAME); + +BQL_ATTR(limit, limit) +BQL_ATTR(limit_max, max_limit) +BQL_ATTR(limit_min, min_limit) + +static struct attribute *dql_attrs[] = { + &bql_limit_attribute.attr, + &bql_limit_max_attribute.attr, + &bql_limit_min_attribute.attr, + &bql_hold_time_attribute.attr, + &bql_inflight_attribute.attr, + NULL +}; + +static struct attribute_group dql_group = { + .name = "byte_queue_limits", + .attrs = dql_attrs, +}; +#endif /* CONFIG_BQL */ + +#ifdef CONFIG_XPS static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; @@ -890,6 +1021,52 @@ static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) +static void xps_queue_release(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (dev_maps) { + for_each_possible_cpu(i) { + map = xmap_dereference(dev_maps->cpu_map[i]); + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + kfree_rcu(map, rcu); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + } + mutex_unlock(&xps_map_mutex); +} + static ssize_t store_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, const char *buf, size_t len) @@ -901,7 +1078,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue, struct xps_map *map, *new_map; struct xps_dev_maps *dev_maps, *new_dev_maps; int nonempty = 0; - int numa_node = -2; + int numa_node_id = -2; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -944,10 +1121,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue, need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); #ifdef CONFIG_NUMA if (need_set) { - if (numa_node == -2) - numa_node = cpu_to_node(cpu); - else if (numa_node != cpu_to_node(cpu)) - numa_node = -1; + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; } #endif if (need_set && pos >= map_len) { @@ -997,7 +1174,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue, if (dev_maps) kfree_rcu(dev_maps, rcu); - netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : + netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); mutex_unlock(&xps_map_mutex); @@ -1020,58 +1197,23 @@ error: static struct netdev_queue_attribute xps_cpus_attribute = __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +#endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] = { + &queue_trans_timeout.attr, +#ifdef CONFIG_XPS &xps_cpus_attribute.attr, +#endif NULL }; static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); - struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - struct xps_map *map; - unsigned long index; - int i, pos, nonempty = 0; - index = get_netdev_queue_index(queue); - - mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - - if (dev_maps) { - for_each_possible_cpu(i) { - map = xmap_dereference(dev_maps->cpu_map[i]); - if (!map) - continue; - - for (pos = 0; pos < map->len; pos++) - if (map->queues[pos] == index) - break; - - if (pos < map->len) { - if (map->len > 1) - map->queues[pos] = - map->queues[--map->len]; - else { - RCU_INIT_POINTER(dev_maps->cpu_map[i], - NULL); - kfree_rcu(map, rcu); - map = NULL; - } - } - if (map) - nonempty = 1; - } - - if (!nonempty) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - } - - mutex_unlock(&xps_map_mutex); +#ifdef CONFIG_XPS + xps_queue_release(queue); +#endif memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); @@ -1092,22 +1234,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index) kobj->kset = net->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto exit; + +#ifdef CONFIG_BQL + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto exit; +#endif kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); + return 0; +exit: + kobject_put(kobj); return error; } -#endif /* CONFIG_XPS */ +#endif /* CONFIG_SYSFS */ int netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS int i; int error = 0; @@ -1119,20 +1268,26 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } } - while (--i >= new_num) - kobject_put(&net->_tx[i].kobj); + while (--i >= new_num) { + struct netdev_queue *queue = net->_tx + i; + +#ifdef CONFIG_BQL + sysfs_remove_group(&queue->kobj, &dql_group); +#endif + kobject_put(&queue->kobj); + } return error; #else return 0; -#endif +#endif /* CONFIG_SYSFS */ } static int register_queue_kobjects(struct net_device *net) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); if (!net->queues_kset) @@ -1173,7 +1328,7 @@ static void remove_queue_kobjects(struct net_device *net) net_rx_queue_update_kobjects(net, real_rx, 0); netdev_queue_update_kobjects(net, real_tx, 0); -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS kset_unregister(net->queues_kset); #endif } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index cf64c1ffa4cd..0d38808a2305 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -76,7 +76,7 @@ static void queue_process(struct work_struct *work) local_irq_save(flags); __netif_tx_lock(txq, smp_processor_id()); - if (netif_tx_queue_frozen_or_stopped(txq) || + if (netif_xmit_frozen_or_stopped(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); @@ -317,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (__netif_tx_trylock(txq)) { - if (!netif_tx_queue_stopped(txq)) { + if (!netif_xmit_stopped(txq)) { status = ops->ndo_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); @@ -422,6 +422,7 @@ static void arp_reply(struct sk_buff *skb) struct sk_buff *send_skb; struct netpoll *np, *tmp; unsigned long flags; + int hlen, tlen; int hits = 0; if (list_empty(&npinfo->rx_np)) @@ -479,8 +480,9 @@ static void arp_reply(struct sk_buff *skb) if (tip != np->local_ip) continue; - send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), - LL_RESERVED_SPACE(np->dev)); + hlen = LL_RESERVED_SPACE(np->dev); + tlen = np->dev->needed_tailroom; + send_skb = find_skb(np, size + hlen + tlen, hlen); if (!send_skb) continue; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 000000000000..3a9fd4826b75 --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,344 @@ +/* + * net/core/netprio_cgroup.c Priority Control Group + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); + +struct cgroup_subsys net_prio_subsys = { + .name = "net_prio", + .create = cgrp_create, + .destroy = cgrp_destroy, + .populate = cgrp_populate, +#ifdef CONFIG_NETPRIO_CGROUP + .subsys_id = net_prio_subsys_id, +#endif + .module = THIS_MODULE +}; + +#define PRIOIDX_SZ 128 + +static unsigned long prioidx_map[PRIOIDX_SZ]; +static DEFINE_SPINLOCK(prioidx_map_lock); +static atomic_t max_prioidx = ATOMIC_INIT(0); + +static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), + struct cgroup_netprio_state, css); +} + +static int get_prioidx(u32 *prio) +{ + unsigned long flags; + u32 prioidx; + + spin_lock_irqsave(&prioidx_map_lock, flags); + prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + set_bit(prioidx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) + return -ENOSPC; + + atomic_set(&max_prioidx, prioidx); + *prio = prioidx; + return 0; +} + +static void put_prioidx(u32 idx) +{ + unsigned long flags; + + spin_lock_irqsave(&prioidx_map_lock, flags); + clear_bit(idx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); +} + +static void extend_netdev_table(struct net_device *dev, u32 new_len) +{ + size_t new_size = sizeof(struct netprio_map) + + ((sizeof(u32) * new_len)); + struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); + struct netprio_map *old_priomap; + int i; + + old_priomap = rtnl_dereference(dev->priomap); + + if (!new_priomap) { + printk(KERN_WARNING "Unable to alloc new priomap!\n"); + return; + } + + for (i = 0; + old_priomap && (i < old_priomap->priomap_len); + i++) + new_priomap->priomap[i] = old_priomap->priomap[i]; + + new_priomap->priomap_len = new_len; + + rcu_assign_pointer(dev->priomap, new_priomap); + if (old_priomap) + kfree_rcu(old_priomap, rcu); +} + +static void update_netdev_tables(void) +{ + struct net_device *dev; + u32 max_len = atomic_read(&max_prioidx); + struct netprio_map *map; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + map = rtnl_dereference(dev->priomap); + if ((!map) || + (map->priomap_len < max_len)) + extend_netdev_table(dev, max_len); + } + rtnl_unlock(); +} + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + int ret; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) { + kfree(cs); + return ERR_PTR(-EINVAL); + } + + ret = get_prioidx(&cs->prioidx); + if (ret != 0) { + printk(KERN_WARNING "No space in priority index array\n"); + kfree(cs); + return ERR_PTR(ret); + } + + return &cs->css; +} + +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + struct net_device *dev; + struct netprio_map *map; + + cs = cgrp_netprio_state(cgrp); + rtnl_lock(); + for_each_netdev(&init_net, dev) { + map = rtnl_dereference(dev->priomap); + if (map) + map->priomap[cs->prioidx] = 0; + } + rtnl_unlock(); + put_prioidx(cs->prioidx); + kfree(cs); +} + +static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +{ + return (u64)cgrp_netprio_state(cgrp)->prioidx; +} + +static int read_priomap(struct cgroup *cont, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct net_device *dev; + u32 prioidx = cgrp_netprio_state(cont)->prioidx; + u32 priority; + struct netprio_map *map; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + map = rcu_dereference(dev->priomap); + priority = map ? map->priomap[prioidx] : 0; + cb->fill(cb, dev->name, priority); + } + rcu_read_unlock(); + return 0; +} + +static int write_priomap(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + char *devname = kstrdup(buffer, GFP_KERNEL); + int ret = -EINVAL; + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; + unsigned long priority; + char *priostr; + struct net_device *dev; + struct netprio_map *map; + + if (!devname) + return -ENOMEM; + + /* + * Minimally sized valid priomap string + */ + if (strlen(devname) < 3) + goto out_free_devname; + + priostr = strstr(devname, " "); + if (!priostr) + goto out_free_devname; + + /* + *Separate the devname from the associated priority + *and advance the priostr poitner to the priority value + */ + *priostr = '\0'; + priostr++; + + /* + * If the priostr points to NULL, we're at the end of the passed + * in string, and its not a valid write + */ + if (*priostr == '\0') + goto out_free_devname; + + ret = kstrtoul(priostr, 10, &priority); + if (ret < 0) + goto out_free_devname; + + ret = -ENODEV; + + dev = dev_get_by_name(&init_net, devname); + if (!dev) + goto out_free_devname; + + update_netdev_tables(); + ret = 0; + rcu_read_lock(); + map = rcu_dereference(dev->priomap); + if (map) + map->priomap[prioidx] = priority; + rcu_read_unlock(); + dev_put(dev); + +out_free_devname: + kfree(devname); + return ret; +} + +static struct cftype ss_files[] = { + { + .name = "prioidx", + .read_u64 = read_prioidx, + }, + { + .name = "ifpriomap", + .read_map = read_priomap, + .write_string = write_priomap, + }, +}; + +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); +} + +static int netprio_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct netprio_map *old; + u32 max_len = atomic_read(&max_prioidx); + + /* + * Note this is called with rtnl_lock held so we have update side + * protection on our rcu assignments + */ + + switch (event) { + + case NETDEV_REGISTER: + if (max_len) + extend_netdev_table(dev, max_len); + break; + case NETDEV_UNREGISTER: + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block netprio_device_notifier = { + .notifier_call = netprio_device_event +}; + +static int __init init_cgroup_netprio(void) +{ + int ret; + + ret = cgroup_load_subsys(&net_prio_subsys); + if (ret) + goto out; +#ifndef CONFIG_NETPRIO_CGROUP + smp_wmb(); + net_prio_subsys_id = net_prio_subsys.subsys_id; +#endif + + register_netdevice_notifier(&netprio_device_notifier); + +out: + return ret; +} + +static void __exit exit_cgroup_netprio(void) +{ + struct netprio_map *old; + struct net_device *dev; + + unregister_netdevice_notifier(&netprio_device_notifier); + + cgroup_unload_subsys(&net_prio_subsys); + +#ifndef CONFIG_NETPRIO_CGROUP + net_prio_subsys_id = -1; + synchronize_rcu(); +#endif + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + } + rtnl_unlock(); +} + +module_init(init_cgroup_netprio); +module_exit(exit_cgroup_netprio); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0001c243b35c..449fe0f068f8 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1304,7 +1304,7 @@ static ssize_t pktgen_if_write(struct file *file, scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); - ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); + pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr; if (debug) printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); @@ -1327,8 +1327,7 @@ static ssize_t pktgen_if_write(struct file *file, scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); - ipv6_addr_copy(&pkt_dev->cur_in6_daddr, - &pkt_dev->min_in6_daddr); + pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr; if (debug) printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); @@ -1371,7 +1370,7 @@ static ssize_t pktgen_if_write(struct file *file, scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); - ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); + pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr; if (debug) printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); @@ -2079,9 +2078,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ifp = ifp->if_next) { if (ifp->scope == IFA_LINK && !(ifp->flags & IFA_F_TENTATIVE)) { - ipv6_addr_copy(&pkt_dev-> - cur_in6_saddr, - &ifp->addr); + pkt_dev->cur_in6_saddr = ifp->addr; err = 0; break; } @@ -2958,8 +2955,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->payload_len = htons(sizeof(struct udphdr) + datalen); iph->nexthdr = IPPROTO_UDP; - ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); - ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); + iph->daddr = pkt_dev->cur_in6_daddr; + iph->saddr = pkt_dev->cur_in6_saddr; skb->mac_header = (skb->network_header - ETH_HLEN - pkt_dev->pkt_overhead); @@ -3345,7 +3342,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) __netif_tx_lock_bh(txq); - if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { + if (unlikely(netif_xmit_frozen_or_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06438f926022..fd3646209b65 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -245,6 +245,55 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** + * build_skb - build a network buffer + * @data: data buffer provided by caller + * + * Allocate a new &sk_buff. Caller provides space holding head and + * skb_shared_info. @data must have been allocated by kmalloc() + * The return is the new skb buffer. + * On a failure the return is %NULL, and @data is not freed. + * Notes : + * Before IO, driver allocates only data buffer where NIC put incoming frame + * Driver should add room at head (NET_SKB_PAD) and + * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) + * After IO, driver calls build_skb(), to allocate sk_buff and populate it + * before giving packet to stack. + * RX rings only contains data buffers, not full skbs. + */ +struct sk_buff *build_skb(void *data) +{ + struct skb_shared_info *shinfo; + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (!skb) + return NULL; + + size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->truesize = SKB_TRUESIZE(size); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + kmemcheck_annotate_variable(shinfo->destructor_arg); + + return skb; +} +EXPORT_SYMBOL(build_skb); + +/** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate @@ -791,8 +840,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) EXPORT_SYMBOL(skb_copy); /** - * pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy - create copy of an sk_buff with private head. * @skb: buffer to copy + * @headroom: headroom of new skb * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and part of its data, located @@ -803,16 +853,16 @@ EXPORT_SYMBOL(skb_copy); * The returned buffer has a reference count of 1. */ -struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) +struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { - unsigned int size = skb_end_pointer(skb) - skb->head; + unsigned int size = skb_headlen(skb) + headroom; struct sk_buff *n = alloc_skb(size, gfp_mask); if (!n) goto out; /* Set the data pointer */ - skb_reserve(n, skb_headroom(skb)); + skb_reserve(n, headroom); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ @@ -848,7 +898,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) out: return n; } -EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(__pskb_copy); /** * pskb_expand_head - reallocate header of &sk_buff @@ -2230,7 +2280,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb) * @shiftlen: shift up to this many bytes * * Attempts to shift up to shiftlen worth of bytes, which may be less than - * the length of the skb, from tgt to skb. Returns number bytes shifted. + * the length of the skb, from skb to tgt. Returns number bytes shifted. * It's up to caller to free skb if everything was shifted. * * If @tgt runs out of frags, the whole operation is aborted. @@ -2621,7 +2671,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum); * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; diff --git a/net/core/sock.c b/net/core/sock.c index cbdf51c0d5ac..9777da86aeac 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -125,6 +125,7 @@ #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h> #include <linux/filter.h> @@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +#if defined(CONFIG_CGROUPS) +#if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; EXPORT_SYMBOL_GPL(net_cls_subsys_id); #endif +#if !defined(CONFIG_NETPRIO_CGROUP) +int net_prio_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_prio_subsys_id); +#endif +#endif static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { @@ -269,14 +276,14 @@ static void sock_warn_obsolete_bsdism(const char *name) } } -static void sock_disable_timestamp(struct sock *sk, int flag) +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { - if (sock_flag(sk, flag)) { - sock_reset_flag(sk, flag); - if (!sock_flag(sk, SOCK_TIMESTAMP) && - !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { + if (sk->sk_flags & flags) { + sk->sk_flags &= ~flags; + if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); - } } } @@ -682,7 +689,7 @@ set_rcvbuf: SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, val & SOF_TIMESTAMPING_SOFTWARE); sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, @@ -1120,6 +1127,18 @@ void sock_update_classid(struct sock *sk) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); + +void sock_update_netprioidx(struct sock *sk) +{ + struct cgroup_netprio_state *state; + if (in_interrupt()) + return; + rcu_read_lock(); + state = task_netprio_state(current); + sk->sk_cgrp_prioidx = state ? state->prioidx : 0; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif /** @@ -1147,6 +1166,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); + sock_update_netprioidx(sk); } return sk; @@ -1167,8 +1187,7 @@ static void __sk_free(struct sock *sk) RCU_INIT_POINTER(sk->sk_filter, NULL); } - sock_disable_timestamp(sk, SOCK_TIMESTAMP); - sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); if (atomic_read(&sk->sk_omem_alloc)) printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", @@ -1213,7 +1232,14 @@ void sk_release_kernel(struct sock *sk) } EXPORT_SYMBOL(sk_release_kernel); -struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +/** + * sk_clone_lock - clone a socket, and lock its clone + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; @@ -1299,14 +1325,13 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) percpu_counter_inc(newsk->sk_prot->sockets_allocated); - if (sock_flag(newsk, SOCK_TIMESTAMP) || - sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: return newsk; } -EXPORT_SYMBOL_GPL(sk_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { @@ -2138,16 +2163,15 @@ EXPORT_SYMBOL(sock_get_timestampns); void sock_enable_timestamp(struct sock *sk, int flag) { if (!sock_flag(sk, flag)) { + unsigned long previous_flags = sk->sk_flags; + sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ - if (!sock_flag(sk, - flag == SOCK_TIMESTAMP ? - SOCK_TIMESTAMPING_RX_SOFTWARE : - SOCK_TIMESTAMP)) + if (!(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 77a65f031488..d05559d4d9cd 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -68,8 +68,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); - synchronize_rcu(); - vfree(orig_sock_table); + if (sock_table) + jump_label_inc(&rps_needed); + if (orig_sock_table) { + jump_label_dec(&rps_needed); + synchronize_rcu(); + vfree(orig_sock_table); + } } } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 90a919afbed7..1c67fe8ff90d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -111,6 +111,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { + err = PTR_ERR(rt); rt = NULL; goto failure; } @@ -473,10 +474,11 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; + const struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .flowi4_oif = skb_rtable(skb)->rt_iif, - .daddr = ip_hdr(skb)->saddr, - .saddr = ip_hdr(skb)->daddr, + .daddr = iph->saddr, + .saddr = iph->daddr, .flowi4_tos = RT_CONN_FLAGS(sk), .flowi4_proto = sk->sk_protocol, .fl4_sport = dccp_hdr(skb)->dccph_dport, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 17ee85ce148d..ce903f747e64 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -150,8 +150,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_DCCP; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; @@ -244,8 +244,8 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_DCCP; - ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); - ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr); + fl6.daddr = ireq6->rmt_addr; + fl6.saddr = ireq6->loc_addr; fl6.flowlabel = 0; fl6.flowi6_oif = ireq6->iif; fl6.fl6_dport = inet_rsk(req)->rmt_port; @@ -270,7 +270,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, dh->dccph_checksum = dccp_v6_csum_finish(skb, &ireq6->loc_addr, &ireq6->rmt_addr); - ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + fl6.daddr = ireq6->rmt_addr; err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); err = net_xmit_eval(err); } @@ -313,8 +313,8 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) &rxip6h->daddr); memset(&fl6, 0, sizeof(fl6)); - ipv6_addr_copy(&fl6.daddr, &rxip6h->saddr); - ipv6_addr_copy(&fl6.saddr, &rxip6h->daddr); + fl6.daddr = rxip6h->saddr; + fl6.saddr = rxip6h->daddr; fl6.flowi6_proto = IPPROTO_DCCP; fl6.flowi6_oif = inet6_iif(rxskb); @@ -419,8 +419,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; ireq6 = inet6_rsk(req); - ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); + ireq6->rmt_addr = ipv6_hdr(skb)->saddr; + ireq6->loc_addr = ipv6_hdr(skb)->daddr; if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || @@ -491,7 +491,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + newnp->rcv_saddr = newnp->saddr; inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; newsk->sk_backlog_rcv = dccp_v4_do_rcv; @@ -526,9 +526,9 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_DCCP; - ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + fl6.daddr = ireq6->rmt_addr; final_p = fl6_update_dst(&fl6, opt, &final); - ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr); + fl6.saddr = ireq6->loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = inet_rsk(req)->rmt_port; fl6.fl6_sport = inet_rsk(req)->loc_port; @@ -559,9 +559,9 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr); - ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr); - ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr); + newnp->daddr = ireq6->rmt_addr; + newnp->saddr = ireq6->loc_addr; + newnp->rcv_saddr = ireq6->loc_addr; newsk->sk_bound_dev_if = ireq6->iif; /* Now IPv6 options... @@ -877,7 +877,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + usin->sin6_addr = flowlabel->dst; fl6_sock_release(flowlabel); } } @@ -910,7 +910,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, return -EINVAL; } - ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* @@ -949,8 +949,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, saddr = &np->rcv_saddr; fl6.flowi6_proto = IPPROTO_DCCP; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, saddr ? saddr : &np->saddr); + fl6.daddr = np->daddr; + fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; @@ -966,11 +966,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (saddr == NULL) { saddr = &fl6.saddr; - ipv6_addr_copy(&np->rcv_saddr, saddr); + np->rcv_saddr = *saddr; } /* set the source address */ - ipv6_addr_copy(&np->saddr, saddr); + np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; __ip6_dst_store(sk, dst, NULL, NULL); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index d7041a0963af..b50d5fd3d696 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -60,8 +60,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); tw6 = inet6_twsk((struct sock *)tw); - ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw6->tw_v6_daddr = np->daddr; + tw6->tw_v6_rcv_saddr = np->rcv_saddr; tw->tw_ipv6only = np->ipv6only; } #endif @@ -100,7 +100,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * (* Generate a new socket and switch to that socket *) * Set S := new socket for this port pair */ - struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); if (newsk != NULL) { struct dccp_request_sock *dreq = dccp_rsk(req); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 33d0e6297c21..0a8d6ebd9b45 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -152,6 +152,17 @@ static const struct file_operations dccpprobe_fops = { .llseek = noop_llseek, }; +static __init int setup_jprobe(void) +{ + int ret = register_jprobe(&dccp_send_probe); + + if (ret) { + request_module("dccp"); + ret = register_jprobe(&dccp_send_probe); + } + return ret; +} + static __init int dccpprobe_init(void) { int ret = -ENOMEM; @@ -163,8 +174,7 @@ static __init int dccpprobe_init(void) if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops)) goto err0; - try_then_request_module((ret = register_jprobe(&dccp_send_probe)) == 0, - "dccp"); + ret = setup_jprobe(); if (ret) goto err1; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 7f0eb087dc11..7d2fff29380f 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -107,7 +107,7 @@ struct neigh_table dn_neigh_table = { .gc_staletime = 60 * HZ, .reachable_time = 30 * HZ, .delay_probe_time = 5 * HZ, - .queue_len = 3, + .queue_len_bytes = 64*1024, .ucast_probes = 0, .app_probes = 0, .mcast_probes = 0, @@ -202,7 +202,7 @@ static int dn_neigh_output_packet(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; - struct neighbour *neigh = dst_get_neighbour(dst); + struct neighbour *neigh = dst_get_neighbour_noref(dst); struct net_device *dev = neigh->dev; char mac_addr[ETH_ALEN]; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index a77d16158eb6..f31ce72dca65 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -112,7 +112,7 @@ static unsigned long dn_rt_deadline; static int dn_dst_gc(struct dst_ops *ops); static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); static unsigned int dn_dst_default_advmss(const struct dst_entry *dst); -static unsigned int dn_dst_default_mtu(const struct dst_entry *dst); +static unsigned int dn_dst_mtu(const struct dst_entry *dst); static void dn_dst_destroy(struct dst_entry *); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); @@ -135,7 +135,7 @@ static struct dst_ops dn_dst_ops = { .gc = dn_dst_gc, .check = dn_dst_check, .default_advmss = dn_dst_default_advmss, - .default_mtu = dn_dst_default_mtu, + .mtu = dn_dst_mtu, .cow_metrics = dst_cow_metrics_generic, .destroy = dn_dst_destroy, .negative_advice = dn_dst_negative_advice, @@ -244,7 +244,7 @@ static int dn_dst_gc(struct dst_ops *ops) */ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) { - struct neighbour *n = dst_get_neighbour(dst); + struct neighbour *n = dst_get_neighbour_noref(dst); u32 min_mtu = 230; struct dn_dev *dn; @@ -713,7 +713,7 @@ out: static int dn_to_neigh_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct neighbour *n = dst_get_neighbour(dst); + struct neighbour *n = dst_get_neighbour_noref(dst); return n->output(n, skb); } @@ -728,7 +728,7 @@ static int dn_output(struct sk_buff *skb) int err = -EINVAL; - if ((neigh = dst_get_neighbour(dst)) == NULL) + if ((neigh = dst_get_neighbour_noref(dst)) == NULL) goto error; skb->dev = dev; @@ -825,9 +825,11 @@ static unsigned int dn_dst_default_advmss(const struct dst_entry *dst) return dn_mss_from_pmtu(dst->dev, dst_mtu(dst)); } -static unsigned int dn_dst_default_mtu(const struct dst_entry *dst) +static unsigned int dn_dst_mtu(const struct dst_entry *dst) { - return dst->dev->mtu; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; } static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) @@ -850,7 +852,7 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) } rt->rt_type = res->type; - if (dev != NULL && dst_get_neighbour(&rt->dst) == NULL) { + if (dev != NULL && dst_get_neighbour_noref(&rt->dst) == NULL) { n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); if (IS_ERR(n)) return PTR_ERR(n); diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c index 67f691bd4acf..d9c150cc59a9 100644 --- a/net/decnet/dn_timer.c +++ b/net/decnet/dn_timer.c @@ -36,16 +36,13 @@ static void dn_slow_timer(unsigned long arg); void dn_start_slow_timer(struct sock *sk) { - sk->sk_timer.expires = jiffies + SLOW_INTERVAL; - sk->sk_timer.function = dn_slow_timer; - sk->sk_timer.data = (unsigned long)sk; - - add_timer(&sk->sk_timer); + setup_timer(&sk->sk_timer, dn_slow_timer, (unsigned long)sk); + sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); } void dn_stop_slow_timer(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); } static void dn_slow_timer(unsigned long arg) @@ -53,12 +50,10 @@ static void dn_slow_timer(unsigned long arg) struct sock *sk = (struct sock *)arg; struct dn_scp *scp = DN_SK(sk); - sock_hold(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk->sk_timer.expires = jiffies + HZ / 10; - add_timer(&sk->sk_timer); + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 10); goto out; } @@ -100,9 +95,7 @@ static void dn_slow_timer(unsigned long arg) scp->keepalive_fxn(sk); } - sk->sk_timer.expires = jiffies + SLOW_INTERVAL; - - add_timer(&sk->sk_timer); + sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL); out: bh_unlock_sock(sk); sock_put(sk); diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index c53ded2a98df..274791cd7a35 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -1,5 +1,5 @@ -menuconfig NET_DSA - bool "Distributed Switch Architecture support" +config NET_DSA + tristate "Distributed Switch Architecture support" default n depends on EXPERIMENTAL && NETDEVICES && !S390 select PHYLIB @@ -23,38 +23,4 @@ config NET_DSA_TAG_TRAILER bool default n - -# switch drivers -config NET_DSA_MV88E6XXX - bool - default n - -config NET_DSA_MV88E6060 - bool "Marvell 88E6060 ethernet switch chip support" - select NET_DSA_TAG_TRAILER - ---help--- - This enables support for the Marvell 88E6060 ethernet switch - chip. - -config NET_DSA_MV88E6XXX_NEED_PPU - bool - default n - -config NET_DSA_MV88E6131 - bool "Marvell 88E6085/6095/6095F/6131 ethernet switch chip support" - select NET_DSA_MV88E6XXX - select NET_DSA_MV88E6XXX_NEED_PPU - select NET_DSA_TAG_DSA - ---help--- - This enables support for the Marvell 88E6085/6095/6095F/6131 - ethernet switch chips. - -config NET_DSA_MV88E6123_61_65 - bool "Marvell 88E6123/6161/6165 ethernet switch chip support" - select NET_DSA_MV88E6XXX - select NET_DSA_TAG_EDSA - ---help--- - This enables support for the Marvell 88E6123/6161/6165 - ethernet switch chips. - endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 2374faff4dea..7b9fcbbeda5d 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,13 +1,8 @@ -# tagging formats -obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o -obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o -obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o - -# switch drivers -obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o -obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o -obj-$(CONFIG_NET_DSA_MV88E6123_61_65) += mv88e6123_61_65.o -obj-$(CONFIG_NET_DSA_MV88E6131) += mv88e6131.o - # the core -obj-$(CONFIG_NET_DSA) += dsa.o slave.o +obj-$(CONFIG_NET_DSA) += dsa_core.o +dsa_core-y += dsa.o slave.o + +# tagging formats +dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o +dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0dc1589343c3..88e7c2f3fa0d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -29,6 +29,7 @@ void register_switch_driver(struct dsa_switch_driver *drv) list_add_tail(&drv->list, &dsa_switch_drivers); mutex_unlock(&dsa_switch_drivers_mutex); } +EXPORT_SYMBOL_GPL(register_switch_driver); void unregister_switch_driver(struct dsa_switch_driver *drv) { @@ -36,6 +37,7 @@ void unregister_switch_driver(struct dsa_switch_driver *drv) list_del_init(&drv->list); mutex_unlock(&dsa_switch_drivers_mutex); } +EXPORT_SYMBOL_GPL(unregister_switch_driver); static struct dsa_switch_driver * dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) @@ -199,29 +201,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds) } -/* hooks for ethertype-less tagging formats *********************************/ -/* - * The original DSA tag format and some other tag formats have no - * ethertype, which means that we need to add a little hack to the - * networking receive path to make sure that received frames get - * the right ->protocol assigned to them when one of those tag - * formats is in use. - */ -bool dsa_uses_dsa_tags(void *dsa_ptr) -{ - struct dsa_switch_tree *dst = dsa_ptr; - - return !!(dst->tag_protocol == htons(ETH_P_DSA)); -} - -bool dsa_uses_trailer_tags(void *dsa_ptr) -{ - struct dsa_switch_tree *dst = dsa_ptr; - - return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); -} - - /* link polling *************************************************************/ static void dsa_link_poll_work(struct work_struct *ugly) { @@ -419,12 +398,36 @@ static struct platform_driver dsa_driver = { static int __init dsa_init_module(void) { - return platform_driver_register(&dsa_driver); + int rc; + + rc = platform_driver_register(&dsa_driver); + if (rc) + return rc; + +#ifdef CONFIG_NET_DSA_TAG_DSA + dev_add_pack(&dsa_packet_type); +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + dev_add_pack(&edsa_packet_type); +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + dev_add_pack(&trailer_packet_type); +#endif + return 0; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { +#ifdef CONFIG_NET_DSA_TAG_TRAILER + dev_remove_pack(&trailer_packet_type); +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + dev_remove_pack(&edsa_packet_type); +#endif +#ifdef CONFIG_NET_DSA_TAG_DSA + dev_remove_pack(&dsa_packet_type); +#endif platform_driver_unregister(&dsa_driver); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 4b0ea0540442..d4cf5cc747e3 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -11,97 +11,9 @@ #ifndef __DSA_PRIV_H #define __DSA_PRIV_H -#include <linux/list.h> #include <linux/phy.h> -#include <linux/timer.h> -#include <linux/workqueue.h> #include <net/dsa.h> -struct dsa_switch { - /* - * Parent switch tree, and switch index. - */ - struct dsa_switch_tree *dst; - int index; - - /* - * Configuration data for this switch. - */ - struct dsa_chip_data *pd; - - /* - * The used switch driver. - */ - struct dsa_switch_driver *drv; - - /* - * Reference to mii bus to use. - */ - struct mii_bus *master_mii_bus; - - /* - * Slave mii_bus and devices for the individual ports. - */ - u32 dsa_port_mask; - u32 phys_port_mask; - struct mii_bus *slave_mii_bus; - struct net_device *ports[DSA_MAX_PORTS]; -}; - -struct dsa_switch_tree { - /* - * Configuration data for the platform device that owns - * this dsa switch tree instance. - */ - struct dsa_platform_data *pd; - - /* - * Reference to network device to use, and which tagging - * protocol to use. - */ - struct net_device *master_netdev; - __be16 tag_protocol; - - /* - * The switch and port to which the CPU is attached. - */ - s8 cpu_switch; - s8 cpu_port; - - /* - * Link state polling. - */ - int link_poll_needed; - struct work_struct link_poll_work; - struct timer_list link_poll_timer; - - /* - * Data for the individual switch chips. - */ - struct dsa_switch *ds[DSA_MAX_SWITCHES]; -}; - -static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) -{ - return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port); -} - -static inline u8 dsa_upstream_port(struct dsa_switch *ds) -{ - struct dsa_switch_tree *dst = ds->dst; - - /* - * If this is the root switch (i.e. the switch that connects - * to the CPU), return the cpu port number on this switch. - * Else return the (DSA) port number that connects to the - * switch that is one hop closer to the cpu. - */ - if (dst->cpu_switch == ds->index) - return dst->cpu_port; - else - return ds->pd->rtable[dst->cpu_switch]; -} - struct dsa_slave_priv { /* * The linux network interface corresponding to this @@ -123,44 +35,8 @@ struct dsa_slave_priv { struct phy_device *phy; }; -struct dsa_switch_driver { - struct list_head list; - - __be16 tag_protocol; - int priv_size; - - /* - * Probing and setup. - */ - char *(*probe)(struct mii_bus *bus, int sw_addr); - int (*setup)(struct dsa_switch *ds); - int (*set_addr)(struct dsa_switch *ds, u8 *addr); - - /* - * Access to the switch's PHY registers. - */ - int (*phy_read)(struct dsa_switch *ds, int port, int regnum); - int (*phy_write)(struct dsa_switch *ds, int port, - int regnum, u16 val); - - /* - * Link state polling and IRQ handling. - */ - void (*poll_link)(struct dsa_switch *ds); - - /* - * ethtool hardware statistics. - */ - void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data); - void (*get_ethtool_stats)(struct dsa_switch *ds, - int port, uint64_t *data); - int (*get_sset_count)(struct dsa_switch *ds); -}; - /* dsa.c */ extern char dsa_driver_version[]; -void register_switch_driver(struct dsa_switch_driver *type); -void unregister_switch_driver(struct dsa_switch_driver *type); /* slave.c */ void dsa_slave_mii_bus_init(struct dsa_switch *ds); @@ -170,12 +46,15 @@ struct net_device *dsa_slave_create(struct dsa_switch *ds, /* tag_dsa.c */ netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); +extern struct packet_type dsa_packet_type; /* tag_edsa.c */ netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); +extern struct packet_type edsa_packet_type; /* tag_trailer.c */ netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); +extern struct packet_type trailer_packet_type; #endif diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c deleted file mode 100644 index 8f4ff5a2c813..000000000000 --- a/net/dsa/mv88e6060.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips - * Copyright (c) 2008-2009 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/phy.h> -#include "dsa_priv.h" - -#define REG_PORT(p) (8 + (p)) -#define REG_GLOBAL 0x0f - -static int reg_read(struct dsa_switch *ds, int addr, int reg) -{ - return mdiobus_read(ds->master_mii_bus, ds->pd->sw_addr + addr, reg); -} - -#define REG_READ(addr, reg) \ - ({ \ - int __ret; \ - \ - __ret = reg_read(ds, addr, reg); \ - if (__ret < 0) \ - return __ret; \ - __ret; \ - }) - - -static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) -{ - return mdiobus_write(ds->master_mii_bus, ds->pd->sw_addr + addr, - reg, val); -} - -#define REG_WRITE(addr, reg, val) \ - ({ \ - int __ret; \ - \ - __ret = reg_write(ds, addr, reg, val); \ - if (__ret < 0) \ - return __ret; \ - }) - -static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr) -{ - int ret; - - ret = mdiobus_read(bus, sw_addr + REG_PORT(0), 0x03); - if (ret >= 0) { - ret &= 0xfff0; - if (ret == 0x0600) - return "Marvell 88E6060"; - } - - return NULL; -} - -static int mv88e6060_switch_reset(struct dsa_switch *ds) -{ - int i; - int ret; - - /* - * Set all ports to the disabled state. - */ - for (i = 0; i < 6; i++) { - ret = REG_READ(REG_PORT(i), 0x04); - REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); - } - - /* - * Wait for transmit queues to drain. - */ - msleep(2); - - /* - * Reset the switch. - */ - REG_WRITE(REG_GLOBAL, 0x0a, 0xa130); - - /* - * Wait up to one second for reset to complete. - */ - for (i = 0; i < 1000; i++) { - ret = REG_READ(REG_GLOBAL, 0x00); - if ((ret & 0x8000) == 0x0000) - break; - - msleep(1); - } - if (i == 1000) - return -ETIMEDOUT; - - return 0; -} - -static int mv88e6060_setup_global(struct dsa_switch *ds) -{ - /* - * Disable discarding of frames with excessive collisions, - * set the maximum frame size to 1536 bytes, and mask all - * interrupt sources. - */ - REG_WRITE(REG_GLOBAL, 0x04, 0x0800); - - /* - * Enable automatic address learning, set the address - * database size to 1024 entries, and set the default aging - * time to 5 minutes. - */ - REG_WRITE(REG_GLOBAL, 0x0a, 0x2130); - - return 0; -} - -static int mv88e6060_setup_port(struct dsa_switch *ds, int p) -{ - int addr = REG_PORT(p); - - /* - * Do not force flow control, disable Ingress and Egress - * Header tagging, disable VLAN tunneling, and set the port - * state to Forwarding. Additionally, if this is the CPU - * port, enable Ingress and Egress Trailer tagging mode. - */ - REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003); - - /* - * Port based VLAN map: give each port its own address - * database, allow the CPU port to talk to each of the 'real' - * ports, and allow each of the 'real' ports to only talk to - * the CPU port. - */ - REG_WRITE(addr, 0x06, - ((p & 0xf) << 12) | - (dsa_is_cpu_port(ds, p) ? - ds->phys_port_mask : - (1 << ds->dst->cpu_port))); - - /* - * Port Association Vector: when learning source addresses - * of packets, add the address to the address database using - * a port bitmap that has only the bit for this port set and - * the other bits clear. - */ - REG_WRITE(addr, 0x0b, 1 << p); - - return 0; -} - -static int mv88e6060_setup(struct dsa_switch *ds) -{ - int i; - int ret; - - ret = mv88e6060_switch_reset(ds); - if (ret < 0) - return ret; - - /* @@@ initialise atu */ - - ret = mv88e6060_setup_global(ds); - if (ret < 0) - return ret; - - for (i = 0; i < 6; i++) { - ret = mv88e6060_setup_port(ds, i); - if (ret < 0) - return ret; - } - - return 0; -} - -static int mv88e6060_set_addr(struct dsa_switch *ds, u8 *addr) -{ - REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]); - REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]); - REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]); - - return 0; -} - -static int mv88e6060_port_to_phy_addr(int port) -{ - if (port >= 0 && port <= 5) - return port; - return -1; -} - -static int mv88e6060_phy_read(struct dsa_switch *ds, int port, int regnum) -{ - int addr; - - addr = mv88e6060_port_to_phy_addr(port); - if (addr == -1) - return 0xffff; - - return reg_read(ds, addr, regnum); -} - -static int -mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) -{ - int addr; - - addr = mv88e6060_port_to_phy_addr(port); - if (addr == -1) - return 0xffff; - - return reg_write(ds, addr, regnum, val); -} - -static void mv88e6060_poll_link(struct dsa_switch *ds) -{ - int i; - - for (i = 0; i < DSA_MAX_PORTS; i++) { - struct net_device *dev; - int uninitialized_var(port_status); - int link; - int speed; - int duplex; - int fc; - - dev = ds->ports[i]; - if (dev == NULL) - continue; - - link = 0; - if (dev->flags & IFF_UP) { - port_status = reg_read(ds, REG_PORT(i), 0x00); - if (port_status < 0) - continue; - - link = !!(port_status & 0x1000); - } - - if (!link) { - if (netif_carrier_ok(dev)) { - printk(KERN_INFO "%s: link down\n", dev->name); - netif_carrier_off(dev); - } - continue; - } - - speed = (port_status & 0x0100) ? 100 : 10; - duplex = (port_status & 0x0200) ? 1 : 0; - fc = ((port_status & 0xc000) == 0xc000) ? 1 : 0; - - if (!netif_carrier_ok(dev)) { - printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, " - "flow control %sabled\n", dev->name, - speed, duplex ? "full" : "half", - fc ? "en" : "dis"); - netif_carrier_on(dev); - } - } -} - -static struct dsa_switch_driver mv88e6060_switch_driver = { - .tag_protocol = htons(ETH_P_TRAILER), - .probe = mv88e6060_probe, - .setup = mv88e6060_setup, - .set_addr = mv88e6060_set_addr, - .phy_read = mv88e6060_phy_read, - .phy_write = mv88e6060_phy_write, - .poll_link = mv88e6060_poll_link, -}; - -static int __init mv88e6060_init(void) -{ - register_switch_driver(&mv88e6060_switch_driver); - return 0; -} -module_init(mv88e6060_init); - -static void __exit mv88e6060_cleanup(void) -{ - unregister_switch_driver(&mv88e6060_switch_driver); -} -module_exit(mv88e6060_cleanup); diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c deleted file mode 100644 index 52faaa21a4d9..000000000000 --- a/net/dsa/mv88e6123_61_65.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support - * Copyright (c) 2008-2009 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/phy.h> -#include "dsa_priv.h" -#include "mv88e6xxx.h" - -static char *mv88e6123_61_65_probe(struct mii_bus *bus, int sw_addr) -{ - int ret; - - ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); - if (ret >= 0) { - ret &= 0xfff0; - if (ret == 0x1210) - return "Marvell 88E6123"; - if (ret == 0x1610) - return "Marvell 88E6161"; - if (ret == 0x1650) - return "Marvell 88E6165"; - } - - return NULL; -} - -static int mv88e6123_61_65_switch_reset(struct dsa_switch *ds) -{ - int i; - int ret; - - /* - * Set all ports to the disabled state. - */ - for (i = 0; i < 8; i++) { - ret = REG_READ(REG_PORT(i), 0x04); - REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); - } - - /* - * Wait for transmit queues to drain. - */ - msleep(2); - - /* - * Reset the switch. - */ - REG_WRITE(REG_GLOBAL, 0x04, 0xc400); - - /* - * Wait up to one second for reset to complete. - */ - for (i = 0; i < 1000; i++) { - ret = REG_READ(REG_GLOBAL, 0x00); - if ((ret & 0xc800) == 0xc800) - break; - - msleep(1); - } - if (i == 1000) - return -ETIMEDOUT; - - return 0; -} - -static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) -{ - int ret; - int i; - - /* - * Disable the PHY polling unit (since there won't be any - * external PHYs to poll), don't discard packets with - * excessive collisions, and mask all interrupt sources. - */ - REG_WRITE(REG_GLOBAL, 0x04, 0x0000); - - /* - * Set the default address aging time to 5 minutes, and - * enable address learn messages to be sent to all message - * ports. - */ - REG_WRITE(REG_GLOBAL, 0x0a, 0x0148); - - /* - * Configure the priority mapping registers. - */ - ret = mv88e6xxx_config_prio(ds); - if (ret < 0) - return ret; - - /* - * Configure the upstream port, and configure the upstream - * port as the port to which ingress and egress monitor frames - * are to be sent. - */ - REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110)); - - /* - * Disable remote management for now, and set the switch's - * DSA device number. - */ - REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f); - - /* - * Send all frames with destination addresses matching - * 01:80:c2:00:00:2x to the CPU port. - */ - REG_WRITE(REG_GLOBAL2, 0x02, 0xffff); - - /* - * Send all frames with destination addresses matching - * 01:80:c2:00:00:0x to the CPU port. - */ - REG_WRITE(REG_GLOBAL2, 0x03, 0xffff); - - /* - * Disable the loopback filter, disable flow control - * messages, disable flood broadcast override, disable - * removing of provider tags, disable ATU age violation - * interrupts, disable tag flow control, force flow - * control priority to the highest, and send all special - * multicast frames to the CPU at the highest priority. - */ - REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); - - /* - * Program the DSA routing table. - */ - for (i = 0; i < 32; i++) { - int nexthop; - - nexthop = 0x1f; - if (i != ds->index && i < ds->dst->pd->nr_chips) - nexthop = ds->pd->rtable[i] & 0x1f; - - REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); - } - - /* - * Clear all trunk masks. - */ - for (i = 0; i < 8; i++) - REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff); - - /* - * Clear all trunk mappings. - */ - for (i = 0; i < 16; i++) - REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11)); - - /* - * Disable ingress rate limiting by resetting all ingress - * rate limit registers to their initial state. - */ - for (i = 0; i < 6; i++) - REG_WRITE(REG_GLOBAL2, 0x09, 0x9000 | (i << 8)); - - /* - * Initialise cross-chip port VLAN table to reset defaults. - */ - REG_WRITE(REG_GLOBAL2, 0x0b, 0x9000); - - /* - * Clear the priority override table. - */ - for (i = 0; i < 16; i++) - REG_WRITE(REG_GLOBAL2, 0x0f, 0x8000 | (i << 8)); - - /* @@@ initialise AVB (22/23) watchdog (27) sdet (29) registers */ - - return 0; -} - -static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) -{ - int addr = REG_PORT(p); - u16 val; - - /* - * MAC Forcing register: don't force link, speed, duplex - * or flow control state to any particular values on physical - * ports, but force the CPU port and all DSA ports to 1000 Mb/s - * full duplex. - */ - if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) - REG_WRITE(addr, 0x01, 0x003e); - else - REG_WRITE(addr, 0x01, 0x0003); - - /* - * Do not limit the period of time that this port can be - * paused for by the remote end or the period of time that - * this port can pause the remote end. - */ - REG_WRITE(addr, 0x02, 0x0000); - - /* - * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock, - * disable Header mode, enable IGMP/MLD snooping, disable VLAN - * tunneling, determine priority by looking at 802.1p and IP - * priority fields (IP prio has precedence), and set STP state - * to Forwarding. - * - * If this is the CPU link, use DSA or EDSA tagging depending - * on which tagging mode was configured. - * - * If this is a link to another switch, use DSA tagging mode. - * - * If this is the upstream port for this switch, enable - * forwarding of unknown unicasts and multicasts. - */ - val = 0x0433; - if (dsa_is_cpu_port(ds, p)) { - if (ds->dst->tag_protocol == htons(ETH_P_EDSA)) - val |= 0x3300; - else - val |= 0x0100; - } - if (ds->dsa_port_mask & (1 << p)) - val |= 0x0100; - if (p == dsa_upstream_port(ds)) - val |= 0x000c; - REG_WRITE(addr, 0x04, val); - - /* - * Port Control 1: disable trunking. Also, if this is the - * CPU port, enable learn messages to be sent to this port. - */ - REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); - - /* - * Port based VLAN map: give each port its own address - * database, allow the CPU port to talk to each of the 'real' - * ports, and allow each of the 'real' ports to only talk to - * the upstream port. - */ - val = (p & 0xf) << 12; - if (dsa_is_cpu_port(ds, p)) - val |= ds->phys_port_mask; - else - val |= 1 << dsa_upstream_port(ds); - REG_WRITE(addr, 0x06, val); - - /* - * Default VLAN ID and priority: don't set a default VLAN - * ID, and set the default packet priority to zero. - */ - REG_WRITE(addr, 0x07, 0x0000); - - /* - * Port Control 2: don't force a good FCS, set the maximum - * frame size to 10240 bytes, don't let the switch add or - * strip 802.1q tags, don't discard tagged or untagged frames - * on this port, do a destination address lookup on all - * received packets as usual, disable ARP mirroring and don't - * send a copy of all transmitted/received frames on this port - * to the CPU. - */ - REG_WRITE(addr, 0x08, 0x2080); - - /* - * Egress rate control: disable egress rate control. - */ - REG_WRITE(addr, 0x09, 0x0001); - - /* - * Egress rate control 2: disable egress rate control. - */ - REG_WRITE(addr, 0x0a, 0x0000); - - /* - * Port Association Vector: when learning source addresses - * of packets, add the address to the address database using - * a port bitmap that has only the bit for this port set and - * the other bits clear. - */ - REG_WRITE(addr, 0x0b, 1 << p); - - /* - * Port ATU control: disable limiting the number of address - * database entries that this port is allowed to use. - */ - REG_WRITE(addr, 0x0c, 0x0000); - - /* - * Priorit Override: disable DA, SA and VTU priority override. - */ - REG_WRITE(addr, 0x0d, 0x0000); - - /* - * Port Ethertype: use the Ethertype DSA Ethertype value. - */ - REG_WRITE(addr, 0x0f, ETH_P_EDSA); - - /* - * Tag Remap: use an identity 802.1p prio -> switch prio - * mapping. - */ - REG_WRITE(addr, 0x18, 0x3210); - - /* - * Tag Remap 2: use an identity 802.1p prio -> switch prio - * mapping. - */ - REG_WRITE(addr, 0x19, 0x7654); - - return 0; -} - -static int mv88e6123_61_65_setup(struct dsa_switch *ds) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int i; - int ret; - - mutex_init(&ps->smi_mutex); - mutex_init(&ps->stats_mutex); - - ret = mv88e6123_61_65_switch_reset(ds); - if (ret < 0) - return ret; - - /* @@@ initialise vtu and atu */ - - ret = mv88e6123_61_65_setup_global(ds); - if (ret < 0) - return ret; - - for (i = 0; i < 6; i++) { - ret = mv88e6123_61_65_setup_port(ds, i); - if (ret < 0) - return ret; - } - - return 0; -} - -static int mv88e6123_61_65_port_to_phy_addr(int port) -{ - if (port >= 0 && port <= 4) - return port; - return -1; -} - -static int -mv88e6123_61_65_phy_read(struct dsa_switch *ds, int port, int regnum) -{ - int addr = mv88e6123_61_65_port_to_phy_addr(port); - return mv88e6xxx_phy_read(ds, addr, regnum); -} - -static int -mv88e6123_61_65_phy_write(struct dsa_switch *ds, - int port, int regnum, u16 val) -{ - int addr = mv88e6123_61_65_port_to_phy_addr(port); - return mv88e6xxx_phy_write(ds, addr, regnum, val); -} - -static struct mv88e6xxx_hw_stat mv88e6123_61_65_hw_stats[] = { - { "in_good_octets", 8, 0x00, }, - { "in_bad_octets", 4, 0x02, }, - { "in_unicast", 4, 0x04, }, - { "in_broadcasts", 4, 0x06, }, - { "in_multicasts", 4, 0x07, }, - { "in_pause", 4, 0x16, }, - { "in_undersize", 4, 0x18, }, - { "in_fragments", 4, 0x19, }, - { "in_oversize", 4, 0x1a, }, - { "in_jabber", 4, 0x1b, }, - { "in_rx_error", 4, 0x1c, }, - { "in_fcs_error", 4, 0x1d, }, - { "out_octets", 8, 0x0e, }, - { "out_unicast", 4, 0x10, }, - { "out_broadcasts", 4, 0x13, }, - { "out_multicasts", 4, 0x12, }, - { "out_pause", 4, 0x15, }, - { "excessive", 4, 0x11, }, - { "collisions", 4, 0x1e, }, - { "deferred", 4, 0x05, }, - { "single", 4, 0x14, }, - { "multiple", 4, 0x17, }, - { "out_fcs_error", 4, 0x03, }, - { "late", 4, 0x1f, }, - { "hist_64bytes", 4, 0x08, }, - { "hist_65_127bytes", 4, 0x09, }, - { "hist_128_255bytes", 4, 0x0a, }, - { "hist_256_511bytes", 4, 0x0b, }, - { "hist_512_1023bytes", 4, 0x0c, }, - { "hist_1024_max_bytes", 4, 0x0d, }, -}; - -static void -mv88e6123_61_65_get_strings(struct dsa_switch *ds, int port, uint8_t *data) -{ - mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats), - mv88e6123_61_65_hw_stats, port, data); -} - -static void -mv88e6123_61_65_get_ethtool_stats(struct dsa_switch *ds, - int port, uint64_t *data) -{ - mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats), - mv88e6123_61_65_hw_stats, port, data); -} - -static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds) -{ - return ARRAY_SIZE(mv88e6123_61_65_hw_stats); -} - -static struct dsa_switch_driver mv88e6123_61_65_switch_driver = { - .tag_protocol = cpu_to_be16(ETH_P_EDSA), - .priv_size = sizeof(struct mv88e6xxx_priv_state), - .probe = mv88e6123_61_65_probe, - .setup = mv88e6123_61_65_setup, - .set_addr = mv88e6xxx_set_addr_indirect, - .phy_read = mv88e6123_61_65_phy_read, - .phy_write = mv88e6123_61_65_phy_write, - .poll_link = mv88e6xxx_poll_link, - .get_strings = mv88e6123_61_65_get_strings, - .get_ethtool_stats = mv88e6123_61_65_get_ethtool_stats, - .get_sset_count = mv88e6123_61_65_get_sset_count, -}; - -static int __init mv88e6123_61_65_init(void) -{ - register_switch_driver(&mv88e6123_61_65_switch_driver); - return 0; -} -module_init(mv88e6123_61_65_init); - -static void __exit mv88e6123_61_65_cleanup(void) -{ - unregister_switch_driver(&mv88e6123_61_65_switch_driver); -} -module_exit(mv88e6123_61_65_cleanup); diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c deleted file mode 100644 index 9bd1061fa4ee..000000000000 --- a/net/dsa/mv88e6131.c +++ /dev/null @@ -1,443 +0,0 @@ -/* - * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support - * Copyright (c) 2008-2009 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/phy.h> -#include "dsa_priv.h" -#include "mv88e6xxx.h" - -/* - * Switch product IDs - */ -#define ID_6085 0x04a0 -#define ID_6095 0x0950 -#define ID_6131 0x1060 - -static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr) -{ - int ret; - - ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); - if (ret >= 0) { - ret &= 0xfff0; - if (ret == ID_6085) - return "Marvell 88E6085"; - if (ret == ID_6095) - return "Marvell 88E6095/88E6095F"; - if (ret == ID_6131) - return "Marvell 88E6131"; - } - - return NULL; -} - -static int mv88e6131_switch_reset(struct dsa_switch *ds) -{ - int i; - int ret; - - /* - * Set all ports to the disabled state. - */ - for (i = 0; i < 11; i++) { - ret = REG_READ(REG_PORT(i), 0x04); - REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); - } - - /* - * Wait for transmit queues to drain. - */ - msleep(2); - - /* - * Reset the switch. - */ - REG_WRITE(REG_GLOBAL, 0x04, 0xc400); - - /* - * Wait up to one second for reset to complete. - */ - for (i = 0; i < 1000; i++) { - ret = REG_READ(REG_GLOBAL, 0x00); - if ((ret & 0xc800) == 0xc800) - break; - - msleep(1); - } - if (i == 1000) - return -ETIMEDOUT; - - return 0; -} - -static int mv88e6131_setup_global(struct dsa_switch *ds) -{ - int ret; - int i; - - /* - * Enable the PHY polling unit, don't discard packets with - * excessive collisions, use a weighted fair queueing scheme - * to arbitrate between packet queues, set the maximum frame - * size to 1632, and mask all interrupt sources. - */ - REG_WRITE(REG_GLOBAL, 0x04, 0x4400); - - /* - * Set the default address aging time to 5 minutes, and - * enable address learn messages to be sent to all message - * ports. - */ - REG_WRITE(REG_GLOBAL, 0x0a, 0x0148); - - /* - * Configure the priority mapping registers. - */ - ret = mv88e6xxx_config_prio(ds); - if (ret < 0) - return ret; - - /* - * Set the VLAN ethertype to 0x8100. - */ - REG_WRITE(REG_GLOBAL, 0x19, 0x8100); - - /* - * Disable ARP mirroring, and configure the upstream port as - * the port to which ingress and egress monitor frames are to - * be sent. - */ - REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0); - - /* - * Disable cascade port functionality unless this device - * is used in a cascade configuration, and set the switch's - * DSA device number. - */ - if (ds->dst->pd->nr_chips > 1) - REG_WRITE(REG_GLOBAL, 0x1c, 0xf000 | (ds->index & 0x1f)); - else - REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f)); - - /* - * Send all frames with destination addresses matching - * 01:80:c2:00:00:0x to the CPU port. - */ - REG_WRITE(REG_GLOBAL2, 0x03, 0xffff); - - /* - * Ignore removed tag data on doubly tagged packets, disable - * flow control messages, force flow control priority to the - * highest, and send all special multicast frames to the CPU - * port at the highest priority. - */ - REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); - - /* - * Program the DSA routing table. - */ - for (i = 0; i < 32; i++) { - int nexthop; - - nexthop = 0x1f; - if (i != ds->index && i < ds->dst->pd->nr_chips) - nexthop = ds->pd->rtable[i] & 0x1f; - - REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); - } - - /* - * Clear all trunk masks. - */ - for (i = 0; i < 8; i++) - REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff); - - /* - * Clear all trunk mappings. - */ - for (i = 0; i < 16; i++) - REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11)); - - /* - * Force the priority of IGMP/MLD snoop frames and ARP frames - * to the highest setting. - */ - REG_WRITE(REG_GLOBAL2, 0x0f, 0x00ff); - - return 0; -} - -static int mv88e6131_setup_port(struct dsa_switch *ds, int p) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int addr = REG_PORT(p); - u16 val; - - /* - * MAC Forcing register: don't force link, speed, duplex - * or flow control state to any particular values on physical - * ports, but force the CPU port and all DSA ports to 1000 Mb/s - * (100 Mb/s on 6085) full duplex. - */ - if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) - if (ps->id == ID_6085) - REG_WRITE(addr, 0x01, 0x003d); /* 100 Mb/s */ - else - REG_WRITE(addr, 0x01, 0x003e); /* 1000 Mb/s */ - else - REG_WRITE(addr, 0x01, 0x0003); - - /* - * Port Control: disable Core Tag, disable Drop-on-Lock, - * transmit frames unmodified, disable Header mode, - * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN - * tunneling, determine priority by looking at 802.1p and - * IP priority fields (IP prio has precedence), and set STP - * state to Forwarding. - * - * If this is the upstream port for this switch, enable - * forwarding of unknown unicasts, and enable DSA tagging - * mode. - * - * If this is the link to another switch, use DSA tagging - * mode, but do not enable forwarding of unknown unicasts. - */ - val = 0x0433; - if (p == dsa_upstream_port(ds)) { - val |= 0x0104; - /* - * On 6085, unknown multicast forward is controlled - * here rather than in Port Control 2 register. - */ - if (ps->id == ID_6085) - val |= 0x0008; - } - if (ds->dsa_port_mask & (1 << p)) - val |= 0x0100; - REG_WRITE(addr, 0x04, val); - - /* - * Port Control 1: disable trunking. Also, if this is the - * CPU port, enable learn messages to be sent to this port. - */ - REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); - - /* - * Port based VLAN map: give each port its own address - * database, allow the CPU port to talk to each of the 'real' - * ports, and allow each of the 'real' ports to only talk to - * the upstream port. - */ - val = (p & 0xf) << 12; - if (dsa_is_cpu_port(ds, p)) - val |= ds->phys_port_mask; - else - val |= 1 << dsa_upstream_port(ds); - REG_WRITE(addr, 0x06, val); - - /* - * Default VLAN ID and priority: don't set a default VLAN - * ID, and set the default packet priority to zero. - */ - REG_WRITE(addr, 0x07, 0x0000); - - /* - * Port Control 2: don't force a good FCS, don't use - * VLAN-based, source address-based or destination - * address-based priority overrides, don't let the switch - * add or strip 802.1q tags, don't discard tagged or - * untagged frames on this port, do a destination address - * lookup on received packets as usual, don't send a copy - * of all transmitted/received frames on this port to the - * CPU, and configure the upstream port number. - * - * If this is the upstream port for this switch, enable - * forwarding of unknown multicast addresses. - */ - if (ps->id == ID_6085) - /* - * on 6085, bits 3:0 are reserved, bit 6 control ARP - * mirroring, and multicast forward is handled in - * Port Control register. - */ - REG_WRITE(addr, 0x08, 0x0080); - else { - val = 0x0080 | dsa_upstream_port(ds); - if (p == dsa_upstream_port(ds)) - val |= 0x0040; - REG_WRITE(addr, 0x08, val); - } - - /* - * Rate Control: disable ingress rate limiting. - */ - REG_WRITE(addr, 0x09, 0x0000); - - /* - * Rate Control 2: disable egress rate limiting. - */ - REG_WRITE(addr, 0x0a, 0x0000); - - /* - * Port Association Vector: when learning source addresses - * of packets, add the address to the address database using - * a port bitmap that has only the bit for this port set and - * the other bits clear. - */ - REG_WRITE(addr, 0x0b, 1 << p); - - /* - * Tag Remap: use an identity 802.1p prio -> switch prio - * mapping. - */ - REG_WRITE(addr, 0x18, 0x3210); - - /* - * Tag Remap 2: use an identity 802.1p prio -> switch prio - * mapping. - */ - REG_WRITE(addr, 0x19, 0x7654); - - return 0; -} - -static int mv88e6131_setup(struct dsa_switch *ds) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int i; - int ret; - - mutex_init(&ps->smi_mutex); - mv88e6xxx_ppu_state_init(ds); - mutex_init(&ps->stats_mutex); - - ps->id = REG_READ(REG_PORT(0), 0x03) & 0xfff0; - - ret = mv88e6131_switch_reset(ds); - if (ret < 0) - return ret; - - /* @@@ initialise vtu and atu */ - - ret = mv88e6131_setup_global(ds); - if (ret < 0) - return ret; - - for (i = 0; i < 11; i++) { - ret = mv88e6131_setup_port(ds, i); - if (ret < 0) - return ret; - } - - return 0; -} - -static int mv88e6131_port_to_phy_addr(int port) -{ - if (port >= 0 && port <= 11) - return port; - return -1; -} - -static int -mv88e6131_phy_read(struct dsa_switch *ds, int port, int regnum) -{ - int addr = mv88e6131_port_to_phy_addr(port); - return mv88e6xxx_phy_read_ppu(ds, addr, regnum); -} - -static int -mv88e6131_phy_write(struct dsa_switch *ds, - int port, int regnum, u16 val) -{ - int addr = mv88e6131_port_to_phy_addr(port); - return mv88e6xxx_phy_write_ppu(ds, addr, regnum, val); -} - -static struct mv88e6xxx_hw_stat mv88e6131_hw_stats[] = { - { "in_good_octets", 8, 0x00, }, - { "in_bad_octets", 4, 0x02, }, - { "in_unicast", 4, 0x04, }, - { "in_broadcasts", 4, 0x06, }, - { "in_multicasts", 4, 0x07, }, - { "in_pause", 4, 0x16, }, - { "in_undersize", 4, 0x18, }, - { "in_fragments", 4, 0x19, }, - { "in_oversize", 4, 0x1a, }, - { "in_jabber", 4, 0x1b, }, - { "in_rx_error", 4, 0x1c, }, - { "in_fcs_error", 4, 0x1d, }, - { "out_octets", 8, 0x0e, }, - { "out_unicast", 4, 0x10, }, - { "out_broadcasts", 4, 0x13, }, - { "out_multicasts", 4, 0x12, }, - { "out_pause", 4, 0x15, }, - { "excessive", 4, 0x11, }, - { "collisions", 4, 0x1e, }, - { "deferred", 4, 0x05, }, - { "single", 4, 0x14, }, - { "multiple", 4, 0x17, }, - { "out_fcs_error", 4, 0x03, }, - { "late", 4, 0x1f, }, - { "hist_64bytes", 4, 0x08, }, - { "hist_65_127bytes", 4, 0x09, }, - { "hist_128_255bytes", 4, 0x0a, }, - { "hist_256_511bytes", 4, 0x0b, }, - { "hist_512_1023bytes", 4, 0x0c, }, - { "hist_1024_max_bytes", 4, 0x0d, }, -}; - -static void -mv88e6131_get_strings(struct dsa_switch *ds, int port, uint8_t *data) -{ - mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6131_hw_stats), - mv88e6131_hw_stats, port, data); -} - -static void -mv88e6131_get_ethtool_stats(struct dsa_switch *ds, - int port, uint64_t *data) -{ - mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6131_hw_stats), - mv88e6131_hw_stats, port, data); -} - -static int mv88e6131_get_sset_count(struct dsa_switch *ds) -{ - return ARRAY_SIZE(mv88e6131_hw_stats); -} - -static struct dsa_switch_driver mv88e6131_switch_driver = { - .tag_protocol = cpu_to_be16(ETH_P_DSA), - .priv_size = sizeof(struct mv88e6xxx_priv_state), - .probe = mv88e6131_probe, - .setup = mv88e6131_setup, - .set_addr = mv88e6xxx_set_addr_direct, - .phy_read = mv88e6131_phy_read, - .phy_write = mv88e6131_phy_write, - .poll_link = mv88e6xxx_poll_link, - .get_strings = mv88e6131_get_strings, - .get_ethtool_stats = mv88e6131_get_ethtool_stats, - .get_sset_count = mv88e6131_get_sset_count, -}; - -static int __init mv88e6131_init(void) -{ - register_switch_driver(&mv88e6131_switch_driver); - return 0; -} -module_init(mv88e6131_init); - -static void __exit mv88e6131_cleanup(void) -{ - unregister_switch_driver(&mv88e6131_switch_driver); -} -module_exit(mv88e6131_cleanup); diff --git a/net/dsa/mv88e6xxx.c b/net/dsa/mv88e6xxx.c deleted file mode 100644 index efe661a9def4..000000000000 --- a/net/dsa/mv88e6xxx.c +++ /dev/null @@ -1,522 +0,0 @@ -/* - * net/dsa/mv88e6xxx.c - Marvell 88e6xxx switch chip support - * Copyright (c) 2008 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/phy.h> -#include "dsa_priv.h" -#include "mv88e6xxx.h" - -/* - * If the switch's ADDR[4:0] strap pins are strapped to zero, it will - * use all 32 SMI bus addresses on its SMI bus, and all switch registers - * will be directly accessible on some {device address,register address} - * pair. If the ADDR[4:0] pins are not strapped to zero, the switch - * will only respond to SMI transactions to that specific address, and - * an indirect addressing mechanism needs to be used to access its - * registers. - */ -static int mv88e6xxx_reg_wait_ready(struct mii_bus *bus, int sw_addr) -{ - int ret; - int i; - - for (i = 0; i < 16; i++) { - ret = mdiobus_read(bus, sw_addr, 0); - if (ret < 0) - return ret; - - if ((ret & 0x8000) == 0) - return 0; - } - - return -ETIMEDOUT; -} - -int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg) -{ - int ret; - - if (sw_addr == 0) - return mdiobus_read(bus, addr, reg); - - /* - * Wait for the bus to become free. - */ - ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); - if (ret < 0) - return ret; - - /* - * Transmit the read command. - */ - ret = mdiobus_write(bus, sw_addr, 0, 0x9800 | (addr << 5) | reg); - if (ret < 0) - return ret; - - /* - * Wait for the read command to complete. - */ - ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); - if (ret < 0) - return ret; - - /* - * Read the data. - */ - ret = mdiobus_read(bus, sw_addr, 1); - if (ret < 0) - return ret; - - return ret & 0xffff; -} - -int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int ret; - - mutex_lock(&ps->smi_mutex); - ret = __mv88e6xxx_reg_read(ds->master_mii_bus, - ds->pd->sw_addr, addr, reg); - mutex_unlock(&ps->smi_mutex); - - return ret; -} - -int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr, - int reg, u16 val) -{ - int ret; - - if (sw_addr == 0) - return mdiobus_write(bus, addr, reg, val); - - /* - * Wait for the bus to become free. - */ - ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); - if (ret < 0) - return ret; - - /* - * Transmit the data to write. - */ - ret = mdiobus_write(bus, sw_addr, 1, val); - if (ret < 0) - return ret; - - /* - * Transmit the write command. - */ - ret = mdiobus_write(bus, sw_addr, 0, 0x9400 | (addr << 5) | reg); - if (ret < 0) - return ret; - - /* - * Wait for the write command to complete. - */ - ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); - if (ret < 0) - return ret; - - return 0; -} - -int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int ret; - - mutex_lock(&ps->smi_mutex); - ret = __mv88e6xxx_reg_write(ds->master_mii_bus, - ds->pd->sw_addr, addr, reg, val); - mutex_unlock(&ps->smi_mutex); - - return ret; -} - -int mv88e6xxx_config_prio(struct dsa_switch *ds) -{ - /* - * Configure the IP ToS mapping registers. - */ - REG_WRITE(REG_GLOBAL, 0x10, 0x0000); - REG_WRITE(REG_GLOBAL, 0x11, 0x0000); - REG_WRITE(REG_GLOBAL, 0x12, 0x5555); - REG_WRITE(REG_GLOBAL, 0x13, 0x5555); - REG_WRITE(REG_GLOBAL, 0x14, 0xaaaa); - REG_WRITE(REG_GLOBAL, 0x15, 0xaaaa); - REG_WRITE(REG_GLOBAL, 0x16, 0xffff); - REG_WRITE(REG_GLOBAL, 0x17, 0xffff); - - /* - * Configure the IEEE 802.1p priority mapping register. - */ - REG_WRITE(REG_GLOBAL, 0x18, 0xfa41); - - return 0; -} - -int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr) -{ - REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]); - REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]); - REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]); - - return 0; -} - -int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr) -{ - int i; - int ret; - - for (i = 0; i < 6; i++) { - int j; - - /* - * Write the MAC address byte. - */ - REG_WRITE(REG_GLOBAL2, 0x0d, 0x8000 | (i << 8) | addr[i]); - - /* - * Wait for the write to complete. - */ - for (j = 0; j < 16; j++) { - ret = REG_READ(REG_GLOBAL2, 0x0d); - if ((ret & 0x8000) == 0) - break; - } - if (j == 16) - return -ETIMEDOUT; - } - - return 0; -} - -int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum) -{ - if (addr >= 0) - return mv88e6xxx_reg_read(ds, addr, regnum); - return 0xffff; -} - -int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val) -{ - if (addr >= 0) - return mv88e6xxx_reg_write(ds, addr, regnum, val); - return 0; -} - -#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU -static int mv88e6xxx_ppu_disable(struct dsa_switch *ds) -{ - int ret; - int i; - - ret = REG_READ(REG_GLOBAL, 0x04); - REG_WRITE(REG_GLOBAL, 0x04, ret & ~0x4000); - - for (i = 0; i < 1000; i++) { - ret = REG_READ(REG_GLOBAL, 0x00); - msleep(1); - if ((ret & 0xc000) != 0xc000) - return 0; - } - - return -ETIMEDOUT; -} - -static int mv88e6xxx_ppu_enable(struct dsa_switch *ds) -{ - int ret; - int i; - - ret = REG_READ(REG_GLOBAL, 0x04); - REG_WRITE(REG_GLOBAL, 0x04, ret | 0x4000); - - for (i = 0; i < 1000; i++) { - ret = REG_READ(REG_GLOBAL, 0x00); - msleep(1); - if ((ret & 0xc000) == 0xc000) - return 0; - } - - return -ETIMEDOUT; -} - -static void mv88e6xxx_ppu_reenable_work(struct work_struct *ugly) -{ - struct mv88e6xxx_priv_state *ps; - - ps = container_of(ugly, struct mv88e6xxx_priv_state, ppu_work); - if (mutex_trylock(&ps->ppu_mutex)) { - struct dsa_switch *ds = ((struct dsa_switch *)ps) - 1; - - if (mv88e6xxx_ppu_enable(ds) == 0) - ps->ppu_disabled = 0; - mutex_unlock(&ps->ppu_mutex); - } -} - -static void mv88e6xxx_ppu_reenable_timer(unsigned long _ps) -{ - struct mv88e6xxx_priv_state *ps = (void *)_ps; - - schedule_work(&ps->ppu_work); -} - -static int mv88e6xxx_ppu_access_get(struct dsa_switch *ds) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int ret; - - mutex_lock(&ps->ppu_mutex); - - /* - * If the PHY polling unit is enabled, disable it so that - * we can access the PHY registers. If it was already - * disabled, cancel the timer that is going to re-enable - * it. - */ - if (!ps->ppu_disabled) { - ret = mv88e6xxx_ppu_disable(ds); - if (ret < 0) { - mutex_unlock(&ps->ppu_mutex); - return ret; - } - ps->ppu_disabled = 1; - } else { - del_timer(&ps->ppu_timer); - ret = 0; - } - - return ret; -} - -static void mv88e6xxx_ppu_access_put(struct dsa_switch *ds) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - - /* - * Schedule a timer to re-enable the PHY polling unit. - */ - mod_timer(&ps->ppu_timer, jiffies + msecs_to_jiffies(10)); - mutex_unlock(&ps->ppu_mutex); -} - -void mv88e6xxx_ppu_state_init(struct dsa_switch *ds) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - - mutex_init(&ps->ppu_mutex); - INIT_WORK(&ps->ppu_work, mv88e6xxx_ppu_reenable_work); - init_timer(&ps->ppu_timer); - ps->ppu_timer.data = (unsigned long)ps; - ps->ppu_timer.function = mv88e6xxx_ppu_reenable_timer; -} - -int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum) -{ - int ret; - - ret = mv88e6xxx_ppu_access_get(ds); - if (ret >= 0) { - ret = mv88e6xxx_reg_read(ds, addr, regnum); - mv88e6xxx_ppu_access_put(ds); - } - - return ret; -} - -int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr, - int regnum, u16 val) -{ - int ret; - - ret = mv88e6xxx_ppu_access_get(ds); - if (ret >= 0) { - ret = mv88e6xxx_reg_write(ds, addr, regnum, val); - mv88e6xxx_ppu_access_put(ds); - } - - return ret; -} -#endif - -void mv88e6xxx_poll_link(struct dsa_switch *ds) -{ - int i; - - for (i = 0; i < DSA_MAX_PORTS; i++) { - struct net_device *dev; - int uninitialized_var(port_status); - int link; - int speed; - int duplex; - int fc; - - dev = ds->ports[i]; - if (dev == NULL) - continue; - - link = 0; - if (dev->flags & IFF_UP) { - port_status = mv88e6xxx_reg_read(ds, REG_PORT(i), 0x00); - if (port_status < 0) - continue; - - link = !!(port_status & 0x0800); - } - - if (!link) { - if (netif_carrier_ok(dev)) { - printk(KERN_INFO "%s: link down\n", dev->name); - netif_carrier_off(dev); - } - continue; - } - - switch (port_status & 0x0300) { - case 0x0000: - speed = 10; - break; - case 0x0100: - speed = 100; - break; - case 0x0200: - speed = 1000; - break; - default: - speed = -1; - break; - } - duplex = (port_status & 0x0400) ? 1 : 0; - fc = (port_status & 0x8000) ? 1 : 0; - - if (!netif_carrier_ok(dev)) { - printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, " - "flow control %sabled\n", dev->name, - speed, duplex ? "full" : "half", - fc ? "en" : "dis"); - netif_carrier_on(dev); - } - } -} - -static int mv88e6xxx_stats_wait(struct dsa_switch *ds) -{ - int ret; - int i; - - for (i = 0; i < 10; i++) { - ret = REG_READ(REG_GLOBAL, 0x1d); - if ((ret & 0x8000) == 0) - return 0; - } - - return -ETIMEDOUT; -} - -static int mv88e6xxx_stats_snapshot(struct dsa_switch *ds, int port) -{ - int ret; - - /* - * Snapshot the hardware statistics counters for this port. - */ - REG_WRITE(REG_GLOBAL, 0x1d, 0xdc00 | port); - - /* - * Wait for the snapshotting to complete. - */ - ret = mv88e6xxx_stats_wait(ds); - if (ret < 0) - return ret; - - return 0; -} - -static void mv88e6xxx_stats_read(struct dsa_switch *ds, int stat, u32 *val) -{ - u32 _val; - int ret; - - *val = 0; - - ret = mv88e6xxx_reg_write(ds, REG_GLOBAL, 0x1d, 0xcc00 | stat); - if (ret < 0) - return; - - ret = mv88e6xxx_stats_wait(ds); - if (ret < 0) - return; - - ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1e); - if (ret < 0) - return; - - _val = ret << 16; - - ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1f); - if (ret < 0) - return; - - *val = _val | ret; -} - -void mv88e6xxx_get_strings(struct dsa_switch *ds, - int nr_stats, struct mv88e6xxx_hw_stat *stats, - int port, uint8_t *data) -{ - int i; - - for (i = 0; i < nr_stats; i++) { - memcpy(data + i * ETH_GSTRING_LEN, - stats[i].string, ETH_GSTRING_LEN); - } -} - -void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, - int nr_stats, struct mv88e6xxx_hw_stat *stats, - int port, uint64_t *data) -{ - struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); - int ret; - int i; - - mutex_lock(&ps->stats_mutex); - - ret = mv88e6xxx_stats_snapshot(ds, port); - if (ret < 0) { - mutex_unlock(&ps->stats_mutex); - return; - } - - /* - * Read each of the counters. - */ - for (i = 0; i < nr_stats; i++) { - struct mv88e6xxx_hw_stat *s = stats + i; - u32 low; - u32 high; - - mv88e6xxx_stats_read(ds, s->reg, &low); - if (s->sizeof_stat == 8) - mv88e6xxx_stats_read(ds, s->reg + 1, &high); - else - high = 0; - - data[i] = (((u64)high) << 32) | low; - } - - mutex_unlock(&ps->stats_mutex); -} diff --git a/net/dsa/mv88e6xxx.h b/net/dsa/mv88e6xxx.h deleted file mode 100644 index 61156ca26a0d..000000000000 --- a/net/dsa/mv88e6xxx.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * net/dsa/mv88e6xxx.h - Marvell 88e6xxx switch chip support - * Copyright (c) 2008 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __MV88E6XXX_H -#define __MV88E6XXX_H - -#define REG_PORT(p) (0x10 + (p)) -#define REG_GLOBAL 0x1b -#define REG_GLOBAL2 0x1c - -struct mv88e6xxx_priv_state { - /* - * When using multi-chip addressing, this mutex protects - * access to the indirect access registers. (In single-chip - * mode, this mutex is effectively useless.) - */ - struct mutex smi_mutex; - -#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU - /* - * Handles automatic disabling and re-enabling of the PHY - * polling unit. - */ - struct mutex ppu_mutex; - int ppu_disabled; - struct work_struct ppu_work; - struct timer_list ppu_timer; -#endif - - /* - * This mutex serialises access to the statistics unit. - * Hold this mutex over snapshot + dump sequences. - */ - struct mutex stats_mutex; - - int id; /* switch product id */ -}; - -struct mv88e6xxx_hw_stat { - char string[ETH_GSTRING_LEN]; - int sizeof_stat; - int reg; -}; - -int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg); -int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg); -int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr, - int reg, u16 val); -int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val); -int mv88e6xxx_config_prio(struct dsa_switch *ds); -int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr); -int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr); -int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum); -int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val); -void mv88e6xxx_ppu_state_init(struct dsa_switch *ds); -int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum); -int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr, - int regnum, u16 val); -void mv88e6xxx_poll_link(struct dsa_switch *ds); -void mv88e6xxx_get_strings(struct dsa_switch *ds, - int nr_stats, struct mv88e6xxx_hw_stat *stats, - int port, uint8_t *data); -void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, - int nr_stats, struct mv88e6xxx_hw_stat *stats, - int port, uint64_t *data); - -#define REG_READ(addr, reg) \ - ({ \ - int __ret; \ - \ - __ret = mv88e6xxx_reg_read(ds, addr, reg); \ - if (__ret < 0) \ - return __ret; \ - __ret; \ - }) - -#define REG_WRITE(addr, reg, val) \ - ({ \ - int __ret; \ - \ - __ret = mv88e6xxx_reg_write(ds, addr, reg, val); \ - if (__ret < 0) \ - return __ret; \ - }) - - - -#endif diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 98dfe80b4538..cacce1e22f9c 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -186,20 +186,7 @@ out: return 0; } -static struct packet_type dsa_packet_type __read_mostly = { +struct packet_type dsa_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DSA), .func = dsa_rcv, }; - -static int __init dsa_init_module(void) -{ - dev_add_pack(&dsa_packet_type); - return 0; -} -module_init(dsa_init_module); - -static void __exit dsa_cleanup_module(void) -{ - dev_remove_pack(&dsa_packet_type); -} -module_exit(dsa_cleanup_module); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 6f383322ad25..e70c43c25e64 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -205,20 +205,7 @@ out: return 0; } -static struct packet_type edsa_packet_type __read_mostly = { +struct packet_type edsa_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_EDSA), .func = edsa_rcv, }; - -static int __init edsa_init_module(void) -{ - dev_add_pack(&edsa_packet_type); - return 0; -} -module_init(edsa_init_module); - -static void __exit edsa_cleanup_module(void) -{ - dev_remove_pack(&edsa_packet_type); -} -module_exit(edsa_cleanup_module); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d6d7d0add3cb..94bc260d015d 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -114,20 +114,7 @@ out: return 0; } -static struct packet_type trailer_packet_type __read_mostly = { +struct packet_type trailer_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_TRAILER), .func = trailer_rcv, }; - -static int __init trailer_init_module(void) -{ - dev_add_pack(&trailer_packet_type); - return 0; -} -module_init(trailer_init_module); - -static void __exit trailer_cleanup_module(void) -{ - dev_remove_pack(&trailer_packet_type); -} -module_exit(trailer_cleanup_module); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 1c1f26c5d672..7e717cb35ad1 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -322,6 +322,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, /* Real hardware Econet. We're not worthy etc. */ #ifdef CONFIG_ECONET_NATIVE unsigned short proto = 0; + int hlen, tlen; int res; if (len + 15 > dev->mtu) { @@ -331,12 +332,14 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, dev_hold(dev); - skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev), + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; + skb = sock_alloc_send_skb(sk, len + hlen + tlen, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_network_header(skb); eb = (struct ec_cb *)&skb->cb; diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 19d6aefe97d4..e4ecc1eef98c 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -50,8 +50,6 @@ * SUCH DAMAGE. */ -#define DEBUG - #include <linux/bitops.h> #include <linux/if_arp.h> #include <linux/module.h> @@ -113,6 +111,20 @@ struct lowpan_dev_record { struct list_head list; }; +struct lowpan_fragment { + struct sk_buff *skb; /* skb to be assembled */ + spinlock_t lock; /* concurency lock */ + u16 length; /* length to be assemled */ + u32 bytes_rcv; /* bytes received */ + u16 tag; /* current fragment tag */ + struct timer_list timer; /* assembling timer */ + struct list_head list; /* fragments list */ +}; + +static unsigned short fragment_tag; +static LIST_HEAD(lowpan_fragments); +spinlock_t flist_lock; + static inline struct lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) { @@ -234,6 +246,50 @@ lowpan_uncompress_addr(struct sk_buff *skb, struct in6_addr *ipaddr, return 0; } +static void +lowpan_compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + + pr_debug("(%s): UDP header compression\n", __func__); + + if (((uh->source & LOWPAN_NHC_UDP_4BIT_MASK) == + LOWPAN_NHC_UDP_4BIT_PORT) && + ((uh->dest & LOWPAN_NHC_UDP_4BIT_MASK) == + LOWPAN_NHC_UDP_4BIT_PORT)) { + pr_debug("(%s): both ports compression to 4 bits\n", __func__); + **hc06_ptr = LOWPAN_NHC_UDP_CS_P_11; + **(hc06_ptr + 1) = /* subtraction is faster */ + (u8)((uh->dest - LOWPAN_NHC_UDP_4BIT_PORT) + + ((uh->source & LOWPAN_NHC_UDP_4BIT_PORT) << 4)); + *hc06_ptr += 2; + } else if ((uh->dest & LOWPAN_NHC_UDP_8BIT_MASK) == + LOWPAN_NHC_UDP_8BIT_PORT) { + pr_debug("(%s): remove 8 bits of dest\n", __func__); + **hc06_ptr = LOWPAN_NHC_UDP_CS_P_01; + memcpy(*hc06_ptr + 1, &uh->source, 2); + **(hc06_ptr + 3) = (u8)(uh->dest - LOWPAN_NHC_UDP_8BIT_PORT); + *hc06_ptr += 4; + } else if ((uh->source & LOWPAN_NHC_UDP_8BIT_MASK) == + LOWPAN_NHC_UDP_8BIT_PORT) { + pr_debug("(%s): remove 8 bits of source\n", __func__); + **hc06_ptr = LOWPAN_NHC_UDP_CS_P_10; + memcpy(*hc06_ptr + 1, &uh->dest, 2); + **(hc06_ptr + 3) = (u8)(uh->source - LOWPAN_NHC_UDP_8BIT_PORT); + *hc06_ptr += 4; + } else { + pr_debug("(%s): can't compress header\n", __func__); + **hc06_ptr = LOWPAN_NHC_UDP_CS_P_00; + memcpy(*hc06_ptr + 1, &uh->source, 2); + memcpy(*hc06_ptr + 3, &uh->dest, 2); + *hc06_ptr += 5; + } + + /* checksum is always inline */ + memcpy(*hc06_ptr, &uh->check, 2); + *hc06_ptr += 2; +} + static u8 lowpan_fetch_skb_u8(struct sk_buff *skb) { u8 ret; @@ -244,6 +300,73 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff *skb) return ret; } +static u16 lowpan_fetch_skb_u16(struct sk_buff *skb) +{ + u16 ret; + + BUG_ON(!pskb_may_pull(skb, 2)); + + ret = skb->data[0] | (skb->data[1] << 8); + skb_pull(skb, 2); + return ret; +} + +static int +lowpan_uncompress_udp_header(struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + u8 tmp; + + tmp = lowpan_fetch_skb_u8(skb); + + if ((tmp & LOWPAN_NHC_UDP_MASK) == LOWPAN_NHC_UDP_ID) { + pr_debug("(%s): UDP header uncompression\n", __func__); + switch (tmp & LOWPAN_NHC_UDP_CS_P_11) { + case LOWPAN_NHC_UDP_CS_P_00: + memcpy(&uh->source, &skb->data[0], 2); + memcpy(&uh->dest, &skb->data[2], 2); + skb_pull(skb, 4); + break; + case LOWPAN_NHC_UDP_CS_P_01: + memcpy(&uh->source, &skb->data[0], 2); + uh->dest = + skb->data[2] + LOWPAN_NHC_UDP_8BIT_PORT; + skb_pull(skb, 3); + break; + case LOWPAN_NHC_UDP_CS_P_10: + uh->source = skb->data[0] + LOWPAN_NHC_UDP_8BIT_PORT; + memcpy(&uh->dest, &skb->data[1], 2); + skb_pull(skb, 3); + break; + case LOWPAN_NHC_UDP_CS_P_11: + uh->source = + LOWPAN_NHC_UDP_4BIT_PORT + (skb->data[0] >> 4); + uh->dest = + LOWPAN_NHC_UDP_4BIT_PORT + (skb->data[0] & 0x0f); + skb_pull(skb, 1); + break; + default: + pr_debug("(%s) ERROR: unknown UDP format\n", __func__); + goto err; + break; + } + + pr_debug("(%s): uncompressed UDP ports: src = %d, dst = %d\n", + __func__, uh->source, uh->dest); + + /* copy checksum */ + memcpy(&uh->check, &skb->data[0], 2); + skb_pull(skb, 2); + } else { + pr_debug("(%s): ERROR: unsupported NH format\n", __func__); + goto err; + } + + return 0; +err: + return -EINVAL; +} + static int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *_daddr, @@ -342,8 +465,6 @@ static int lowpan_header_create(struct sk_buff *skb, if (hdr->nexthdr == UIP_PROTO_UDP) iphc0 |= LOWPAN_IPHC_NH_C; -/* TODO: next header compression */ - if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { *hc06_ptr = hdr->nexthdr; hc06_ptr += 1; @@ -431,8 +552,9 @@ static int lowpan_header_create(struct sk_buff *skb, } } - /* TODO: UDP header compression */ - /* TODO: Next Header compression */ + /* UDP header compression */ + if (hdr->nexthdr == UIP_PROTO_UDP) + lowpan_compress_udp_header(&hc06_ptr, skb); head[0] = iphc0; head[1] = iphc1; @@ -467,6 +589,7 @@ static int lowpan_header_create(struct sk_buff *skb, memcpy(&(sa.hwaddr), saddr, 8); mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA; + return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, type, (void *)&da, (void *)&sa, skb->len); } @@ -511,6 +634,21 @@ static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr) return stat; } +static void lowpan_fragment_timer_expired(unsigned long entry_addr) +{ + struct lowpan_fragment *entry = (struct lowpan_fragment *)entry_addr; + + pr_debug("%s: timer expired for frame with tag %d\n", __func__, + entry->tag); + + spin_lock(&flist_lock); + list_del(&entry->list); + spin_unlock(&flist_lock); + + dev_kfree_skb(entry->skb); + kfree(entry); +} + static int lowpan_process_data(struct sk_buff *skb) { @@ -525,6 +663,107 @@ lowpan_process_data(struct sk_buff *skb) if (skb->len < 2) goto drop; iphc0 = lowpan_fetch_skb_u8(skb); + + /* fragments assembling */ + switch (iphc0 & LOWPAN_DISPATCH_MASK) { + case LOWPAN_DISPATCH_FRAG1: + case LOWPAN_DISPATCH_FRAGN: + { + struct lowpan_fragment *frame; + u8 len, offset; + u16 tag; + bool found = false; + + len = lowpan_fetch_skb_u8(skb); /* frame length */ + tag = lowpan_fetch_skb_u16(skb); + + /* + * check if frame assembling with the same tag is + * already in progress + */ + spin_lock(&flist_lock); + + list_for_each_entry(frame, &lowpan_fragments, list) + if (frame->tag == tag) { + found = true; + break; + } + + /* alloc new frame structure */ + if (!found) { + frame = kzalloc(sizeof(struct lowpan_fragment), + GFP_ATOMIC); + if (!frame) + goto unlock_and_drop; + + INIT_LIST_HEAD(&frame->list); + + frame->length = (iphc0 & 7) | (len << 3); + frame->tag = tag; + + /* allocate buffer for frame assembling */ + frame->skb = alloc_skb(frame->length + + sizeof(struct ipv6hdr), GFP_ATOMIC); + + if (!frame->skb) { + kfree(frame); + goto unlock_and_drop; + } + + frame->skb->priority = skb->priority; + frame->skb->dev = skb->dev; + + /* reserve headroom for uncompressed ipv6 header */ + skb_reserve(frame->skb, sizeof(struct ipv6hdr)); + skb_put(frame->skb, frame->length); + + init_timer(&frame->timer); + /* time out is the same as for ipv6 - 60 sec */ + frame->timer.expires = jiffies + LOWPAN_FRAG_TIMEOUT; + frame->timer.data = (unsigned long)frame; + frame->timer.function = lowpan_fragment_timer_expired; + + add_timer(&frame->timer); + + list_add_tail(&frame->list, &lowpan_fragments); + } + + if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1) + goto unlock_and_drop; + + offset = lowpan_fetch_skb_u8(skb); /* fetch offset */ + + /* if payload fits buffer, copy it */ + if (likely((offset * 8 + skb->len) <= frame->length)) + skb_copy_to_linear_data_offset(frame->skb, offset * 8, + skb->data, skb->len); + else + goto unlock_and_drop; + + frame->bytes_rcv += skb->len; + + /* frame assembling complete */ + if ((frame->bytes_rcv == frame->length) && + frame->timer.expires > jiffies) { + /* if timer haven't expired - first of all delete it */ + del_timer(&frame->timer); + list_del(&frame->list); + spin_unlock(&flist_lock); + + dev_kfree_skb(skb); + skb = frame->skb; + kfree(frame); + iphc0 = lowpan_fetch_skb_u8(skb); + break; + } + spin_unlock(&flist_lock); + + return kfree_skb(skb), 0; + } + default: + break; + } + iphc1 = lowpan_fetch_skb_u8(skb); _saddr = mac_cb(skb)->sa.hwaddr; @@ -659,7 +898,10 @@ lowpan_process_data(struct sk_buff *skb) goto drop; } - /* TODO: UDP header parse */ + /* UDP data uncompression */ + if (iphc0 & LOWPAN_IPHC_NH_C) + if (lowpan_uncompress_udp_header(skb)) + goto drop; /* Not fragmented package */ hdr.payload_len = htons(skb->len); @@ -674,6 +916,9 @@ lowpan_process_data(struct sk_buff *skb) lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, sizeof(hdr)); return lowpan_skb_deliver(skb, &hdr); + +unlock_and_drop: + spin_unlock(&flist_lock); drop: kfree_skb(skb); return -EINVAL; @@ -692,18 +937,115 @@ static int lowpan_set_address(struct net_device *dev, void *p) return 0; } +static int lowpan_get_mac_header_length(struct sk_buff *skb) +{ + /* + * Currently long addressing mode is supported only, so the overall + * header size is 21: + * FC SeqNum DPAN DA SA Sec + * 2 + 1 + 2 + 8 + 8 + 0 = 21 + */ + return 21; +} + +static int +lowpan_fragment_xmit(struct sk_buff *skb, u8 *head, + int mlen, int plen, int offset) +{ + struct sk_buff *frag; + int hlen, ret; + + /* if payload length is zero, therefore it's a first fragment */ + hlen = (plen == 0 ? LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE); + + lowpan_raw_dump_inline(__func__, "6lowpan fragment header", head, hlen); + + frag = dev_alloc_skb(hlen + mlen + plen + IEEE802154_MFR_SIZE); + if (!frag) + return -ENOMEM; + + frag->priority = skb->priority; + frag->dev = skb->dev; + + /* copy header, MFR and payload */ + memcpy(skb_put(frag, mlen), skb->data, mlen); + memcpy(skb_put(frag, hlen), head, hlen); + + if (plen) + skb_copy_from_linear_data_offset(skb, offset + mlen, + skb_put(frag, plen), plen); + + lowpan_raw_dump_table(__func__, " raw fragment dump", frag->data, + frag->len); + + ret = dev_queue_xmit(frag); + + return ret; +} + +static int +lowpan_skb_fragmentation(struct sk_buff *skb) +{ + int err, header_length, payload_length, tag, offset = 0; + u8 head[5]; + + header_length = lowpan_get_mac_header_length(skb); + payload_length = skb->len - header_length; + tag = fragment_tag++; + + /* first fragment header */ + head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_length & 0x7); + head[1] = (payload_length >> 3) & 0xff; + head[2] = tag & 0xff; + head[3] = tag >> 8; + + err = lowpan_fragment_xmit(skb, head, header_length, 0, 0); + + /* next fragment header */ + head[0] &= ~LOWPAN_DISPATCH_FRAG1; + head[0] |= LOWPAN_DISPATCH_FRAGN; + + while ((payload_length - offset > 0) && (err >= 0)) { + int len = LOWPAN_FRAG_SIZE; + + head[4] = offset / 8; + + if (payload_length - offset < len) + len = payload_length - offset; + + err = lowpan_fragment_xmit(skb, head, header_length, + len, offset); + offset += len; + } + + return err; +} + static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) { - int err = 0; + int err = -1; pr_debug("(%s): package xmit\n", __func__); skb->dev = lowpan_dev_info(dev)->real_dev; if (skb->dev == NULL) { pr_debug("(%s) ERROR: no real wpan device found\n", __func__); - dev_kfree_skb(skb); - } else + goto error; + } + + if (skb->len <= IEEE802154_MTU) { err = dev_queue_xmit(skb); + goto out; + } + + pr_debug("(%s): frame is too big, fragmentation is needed\n", + __func__); + err = lowpan_skb_fragmentation(skb); +error: + dev_kfree_skb(skb); +out: + if (err < 0) + pr_debug("(%s): ERROR: xmit failed\n", __func__); return (err < 0 ? NETDEV_TX_BUSY : NETDEV_TX_OK); } @@ -730,13 +1072,12 @@ static void lowpan_setup(struct net_device *dev) dev->addr_len = IEEE802154_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN); dev->type = ARPHRD_IEEE802154; - dev->features = NETIF_F_NO_CSUM; /* Frame Control + Sequence Number + Address fields + Security Header */ dev->hard_header_len = 2 + 1 + 20 + 14; dev->needed_tailroom = 2; /* FCS */ dev->mtu = 1281; dev->tx_queue_len = 0; - dev->flags = IFF_NOARP | IFF_BROADCAST; + dev->flags = IFF_BROADCAST | IFF_MULTICAST; dev->watchdog_timeo = 0; dev->netdev_ops = &lowpan_netdev_ops; @@ -765,8 +1106,15 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; /* check that it's our buffer */ - if ((skb->data[0] & 0xe0) == 0x60) + switch (skb->data[0] & 0xe0) { + case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ + case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ + case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */ lowpan_process_data(skb); + break; + default: + break; + } return NET_RX_SUCCESS; diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h index 5d8cf80b930d..aeff3f310482 100644 --- a/net/ieee802154/6lowpan.h +++ b/net/ieee802154/6lowpan.h @@ -159,6 +159,24 @@ #define LOWPAN_DISPATCH_FRAG1 0xc0 /* 11000xxx */ #define LOWPAN_DISPATCH_FRAGN 0xe0 /* 11100xxx */ +#define LOWPAN_DISPATCH_MASK 0xf8 /* 11111000 */ + +#define LOWPAN_FRAG_TIMEOUT (HZ * 60) /* time-out 60 sec */ + +#define LOWPAN_FRAG1_HEAD_SIZE 0x4 +#define LOWPAN_FRAGN_HEAD_SIZE 0x5 + +/* + * According IEEE802.15.4 standard: + * - MTU is 127 octets + * - maximum MHR size is 37 octets + * - MFR size is 2 octets + * + * so minimal payload size that we may guarantee is: + * MTU - MHR - MFR = 88 octets + */ +#define LOWPAN_FRAG_SIZE 88 + /* * Values of fields within the IPHC encoding first byte * (C stands for compressed and I for inline) @@ -201,6 +219,11 @@ #define LOWPAN_NHC_UDP_CHECKSUMC 0x04 #define LOWPAN_NHC_UDP_CHECKSUMI 0x00 +#define LOWPAN_NHC_UDP_4BIT_PORT 0xF0B0 +#define LOWPAN_NHC_UDP_4BIT_MASK 0xFFF0 +#define LOWPAN_NHC_UDP_8BIT_PORT 0xF000 +#define LOWPAN_NHC_UDP_8BIT_MASK 0xFF00 + /* values for port compression, _with checksum_ ie bit 5 set to 0 */ #define LOWPAN_NHC_UDP_CS_P_00 0xF0 /* all inline */ #define LOWPAN_NHC_UDP_CS_P_01 0xF1 /* source 16bit inline, diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index faecf648123f..1b09eaabaac1 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -209,6 +209,7 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, unsigned mtu; struct sk_buff *skb; struct dgram_sock *ro = dgram_sk(sk); + int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { @@ -229,13 +230,15 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, mtu = dev->mtu; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); - skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + size, + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; + skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_network_header(skb); diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 10970ca85748..f96bae8fd330 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -108,6 +108,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct net_device *dev; unsigned mtu; struct sk_buff *skb; + int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { @@ -137,12 +138,14 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_dev; } - skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + size, + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; + skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_mac_header(skb); skb_reset_network_header(skb); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1b5096a9875a..15dc4c4828de 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1250,7 +1250,8 @@ out: return err; } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; @@ -1572,9 +1573,9 @@ static __net_init int ipv4_mib_init_net(struct net *net) sizeof(struct icmp_mib), __alignof__(struct icmp_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, - sizeof(struct icmpmsg_mib), - __alignof__(struct icmpmsg_mib)) < 0) + net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), + GFP_KERNEL); + if (!net->mib.icmpmsg_statistics) goto err_icmpmsg_mib; tcp_mib_init(net); @@ -1598,7 +1599,7 @@ err_tcp_mib: static __net_exit void ipv4_mib_exit_net(struct net *net) { - snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); + kfree(net->mib.icmpmsg_statistics); snmp_mib_free((void __percpu **)net->mib.icmp_statistics); snmp_mib_free((void __percpu **)net->mib.udplite_statistics); snmp_mib_free((void __percpu **)net->mib.udp_statistics); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index c1f4154552fc..36d14406261e 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -136,8 +136,6 @@ static void ah_output_done(struct crypto_async_request *base, int err) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - err = ah->nexthdr; - kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, err); } @@ -264,12 +262,12 @@ static void ah_input_done(struct crypto_async_request *base, int err) if (err) goto out; + err = ah->nexthdr; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); skb_set_transport_header(skb, -ihl); - - err = ah->nexthdr; out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); @@ -371,8 +369,6 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) - err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 96a164aa1367..381a0876c363 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -112,11 +112,6 @@ #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) -#include <net/atmclip.h> -struct neigh_table *clip_tbl_hook; -EXPORT_SYMBOL(clip_tbl_hook); -#endif #include <asm/system.h> #include <linux/uaccess.h> @@ -164,7 +159,6 @@ static const struct neigh_ops arp_broken_ops = { struct neigh_table arp_tbl = { .family = AF_INET, - .entry_size = sizeof(struct neighbour) + 4, .key_len = 4, .hash = arp_hash, .constructor = arp_constructor, @@ -177,7 +171,7 @@ struct neigh_table arp_tbl = { .gc_staletime = 60 * HZ, .reachable_time = 30 * HZ, .delay_probe_time = 5 * HZ, - .queue_len = 3, + .queue_len_bytes = 64*1024, .ucast_probes = 3, .mcast_probes = 3, .anycast_delay = 1 * HZ, @@ -283,9 +277,9 @@ static int arp_constructor(struct neighbour *neigh) default: break; case ARPHRD_ROSE: -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: -#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: #endif neigh->ops = &arp_broken_ops; @@ -592,16 +586,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; /* * Allocate a buffer */ - skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); + skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (skb == NULL) return NULL; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; @@ -633,13 +629,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp->ar_pro = htons(ETH_P_IP); break; -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; -#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); @@ -647,13 +643,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, #endif #endif -#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) +#if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif -#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) +#if IS_ENABLED(CONFIG_TR) case ARPHRD_IEEE802_TR: arp->ar_hrd = htons(ARPHRD_IEEE802); arp->ar_pro = htons(ETH_P_IP); @@ -1040,7 +1036,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, return -EINVAL; } switch (dev->type) { -#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) +#if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP @@ -1286,7 +1282,7 @@ void __init arp_init(void) } #ifdef CONFIG_PROC_FS -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25) /* ------------------------------------------------------------------------ */ /* @@ -1334,7 +1330,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. */ -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { @@ -1347,7 +1343,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, if (k != 0) --k; hbuffer[k] = 0; -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c6b5092f29a1..65f01dc47565 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1490,7 +1490,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; @@ -1501,6 +1503,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); + if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) + if ((new_value == 0) && (old_value != 0)) + rt_cache_flush(net, 0); } return ret; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c7472eff2d51..fa057d105bef 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -304,9 +304,11 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; while (1) { - skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), + skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; @@ -327,7 +329,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_dst_set(skb, &rt->dst); skb->dev = dev; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_network_header(skb); pip = ip_hdr(skb); @@ -647,6 +649,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, __be32 group = pmc ? pmc->multiaddr : 0; struct flowi4 fl4; __be32 dst; + int hlen, tlen; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); @@ -661,7 +664,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (IS_ERR(rt)) return -1; - skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; + skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); return -1; @@ -669,7 +674,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, skb_dst_set(skb, &rt->dst); - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_network_header(skb); iph = ip_hdr(skb); @@ -1574,7 +1579,7 @@ out_unlock: * Add multicast single-source filter to the interface list */ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, - __be32 *psfsrc, int delta) + __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; @@ -1709,14 +1714,15 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[sfmode]++; err = 0; for (i=0; i<sfcount; i++) { - err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; - pmc->sfcount[sfmode]--; + if (!delta) + pmc->sfcount[sfmode]--; for (j=0; j<i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c14d88ad348d..a598768c616c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -588,10 +588,19 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); -struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, - const gfp_t priority) +/** + * inet_csk_clone_lock - clone an inet socket, and lock its clone + * @sk: the socket to clone + * @req: request_sock + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *inet_csk_clone_lock(const struct sock *sk, + const struct request_sock *req, + const gfp_t priority) { - struct sock *newsk = sk_clone(sk, priority); + struct sock *newsk = sk_clone_lock(sk, priority); if (newsk != NULL) { struct inet_connection_sock *newicsk = inet_csk(newsk); @@ -615,7 +624,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, } return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone); +EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f5e2bdaef949..0a46c541b477 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -108,9 +108,6 @@ static int inet_csk_diag_fill(struct sock *sk, icsk->icsk_ca_ops->name); } - if ((ext & (1 << (INET_DIAG_TOS - 1))) && (sk->sk_family != AF_INET6)) - RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); - r->idiag_family = sk->sk_family; r->idiag_state = sk->sk_state; r->idiag_timer = 0; @@ -125,16 +122,20 @@ static int inet_csk_diag_fill(struct sock *sk, r->id.idiag_src[0] = inet->inet_rcv_saddr; r->id.idiag_dst[0] = inet->inet_daddr; + /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, + * hence this needs to be included regardless of socket family. + */ + if (ext & (1 << (INET_DIAG_TOS - 1))) + RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); + #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { const struct ipv6_pinfo *np = inet6_sk(sk); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &np->rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &np->daddr); - if (ext & (1 << (INET_DIAG_TOS - 1))) - RTA_PUT_U8(skb, INET_DIAG_TOS, np->tclass); + *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = np->daddr; + if (ext & (1 << (INET_DIAG_TCLASS - 1))) + RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); } #endif @@ -224,10 +225,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, const struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tw6->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tw6->tw_v6_daddr); + *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; } #endif nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; @@ -603,10 +602,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &inet6_rsk(req)->loc_addr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &inet6_rsk(req)->rmt_addr); + *(struct in6_addr *)r->id.idiag_src = inet6_rsk(req)->loc_addr; + *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; } #endif nlh->nlmsg_len = skb_tail_pointer(skb) - b; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3b34d1c86270..29a07b6c7168 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) + if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d55110e93120..fe070c1593ab 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -171,7 +171,7 @@ struct pcpu_tstats { unsigned long rx_bytes; unsigned long tx_packets; unsigned long tx_bytes; -}; +} __attribute__((aligned(4*sizeof(unsigned long)))); static struct net_device_stats *ipgre_get_stats(struct net_device *dev) { @@ -731,7 +731,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { - struct neighbour *neigh = dst_get_neighbour(skb_dst(skb)); + struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb)); const struct in6_addr *addr6; int addr_type; @@ -835,6 +835,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); dev->stats.tx_dropped++; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec93335901dd..1e60f7679075 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -568,12 +568,13 @@ void ip_forward_options(struct sk_buff *skb) ) { if (srrptr + 3 > srrspace) break; - if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) + if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_rt_get_source(&optptr[srrptr-1], skb, rt); + ip_hdr(skb)->daddr = opt->nexthop; optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); @@ -640,6 +641,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) } if (srrptr <= srrspace) { opt->srr_is_hit = 1; + opt->nexthop = nexthop; opt->is_changed = 1; } return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0bc95f3977d2..ff302bde8890 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -206,7 +206,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) } rcu_read_lock(); - neigh = dst_get_neighbour(dst); + neigh = dst_get_neighbour_noref(dst); if (neigh) { int res = neigh_output(neigh, skb); @@ -319,6 +319,20 @@ int ip_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } +/* + * copy saddr and daddr, possibly using 64bit load/stores + * Equivalent to : + * iph->saddr = fl4->saddr; + * iph->daddr = fl4->daddr; + */ +static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) +{ + BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != + offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); + memcpy(&iph->saddr, &fl4->saddr, + sizeof(fl4->saddr) + sizeof(fl4->daddr)); +} + int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) { struct sock *sk = skb->sk; @@ -381,8 +395,8 @@ packet_routed: iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; - iph->saddr = fl4->saddr; - iph->daddr = fl4->daddr; + ip_copy_addrs(iph, fl4); + /* Transport layer set skb->h.foo itself. */ if (inet_opt && inet_opt->opt.optlen) { @@ -1337,8 +1351,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; - iph->saddr = fl4->saddr; - iph->daddr = fl4->daddr; + ip_copy_addrs(iph, fl4); if (opt) { iph->ihl += opt->optlen>>2; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 09ff51bf16a4..80d5fa450210 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -55,20 +55,13 @@ /* * SOL_IP control messages. */ +#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { - struct in_pktinfo info; - struct rtable *rt = skb_rtable(skb); + struct in_pktinfo info = *PKTINFO_SKB_CB(skb); info.ipi_addr.s_addr = ip_hdr(skb)->daddr; - if (rt) { - info.ipi_ifindex = rt->rt_iif; - info.ipi_spec_dst.s_addr = rt->rt_spec_dst; - } else { - info.ipi_ifindex = 0; - info.ipi_spec_dst.s_addr = 0; - } put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } @@ -992,20 +985,28 @@ e_inval: } /** - * ip_queue_rcv_skb - Queue an skb into sock receive queue + * ipv4_pktinfo_prepare - transfert some info from rtable to skb * @sk: socket * @skb: buffer * - * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option - * is not set, we drop skb dst entry now, while dst cache line is hot. + * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst + * in skb->cb[] before dst drop. + * This way, receiver doesnt make cache line misses to read rtable. */ -int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +void ipv4_pktinfo_prepare(struct sk_buff *skb) { - if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) - skb_dst_drop(skb); - return sock_queue_rcv_skb(sk, skb); + struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); + const struct rtable *rt = skb_rtable(skb); + + if (rt) { + pktinfo->ipi_ifindex = rt->rt_iif; + pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + pktinfo->ipi_ifindex = 0; + pktinfo->ipi_spec_dst.s_addr = 0; + } + skb_dst_drop(skb); } -EXPORT_SYMBOL(ip_queue_rcv_skb); int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 0da2afc97f32..915eb5265b2e 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -763,13 +763,15 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d struct sk_buff *skb; struct bootp_pkt *b; struct iphdr *h; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; /* Allocate packet */ - skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, + skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15, GFP_KERNEL); if (!skb) return; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); memset(b, 0, sizeof(struct bootp_pkt)); @@ -822,8 +824,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d skb->dev = dev; skb->protocol = htons(ETH_P_IP); if (dev_hard_header(skb, dev, ntohs(skb->protocol), - dev->broadcast, dev->dev_addr, skb->len) < 0 || - dev_queue_xmit(skb) < 0) + dev->broadcast, dev->dev_addr, skb->len) < 0) { + kfree_skb(skb); + printk("E"); + return; + } + + if (dev_queue_xmit(skb) < 0) printk("E"); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 065effd8349a..94906908a416 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -148,7 +148,7 @@ struct pcpu_tstats { unsigned long rx_bytes; unsigned long tx_packets; unsigned long tx_bytes; -}; +} __attribute__((aligned(4*sizeof(unsigned long)))); static struct net_device_stats *ipip_get_stats(struct net_device *dev) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 76a7f07b38b6..8e54490ee3f4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1520,7 +1520,6 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v struct mr_table *mrt; struct vif_device *v; int ct; - LIST_HEAD(list); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; @@ -1529,10 +1528,9 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (v->dev == dev) - vif_delete(mrt, ct, 1, &list); + vif_delete(mrt, ct, 1, NULL); } } - unregister_netdevice_many(&list); return NOTIFY_DONE; } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 9899619ab9b8..4f47e064e262 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -64,7 +64,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && - pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), + 0, GFP_ATOMIC)) return -1; return 0; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1dfc18a03fd4..f19f2182894c 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -325,7 +325,6 @@ config IP_NF_TARGET_TTL # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' - depends on NETFILTER_ADVANCED help This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index e59aabd0eae4..a057fe64debd 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -404,6 +404,7 @@ __ipq_rcv_skb(struct sk_buff *skb) int status, type, pid, flags; unsigned int nlmsglen, skblen; struct nlmsghdr *nlh; + bool enable_timestamp = false; skblen = skb->len; if (skblen < sizeof(*nlh)) @@ -441,12 +442,13 @@ __ipq_rcv_skb(struct sk_buff *skb) RCV_SKB_FAIL(-EBUSY); } } else { - net_enable_timestamp(); + enable_timestamp = true; peer_pid = pid; } spin_unlock_bh(&queue_lock); - + if (enable_timestamp) + net_enable_timestamp(); status = ipq_receive_peer(NLMSG_DATA(nlh), type, nlmsglen - NLMSG_LENGTH(0)); if (status < 0) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index a06f73fdb3c0..43d4c3b22369 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -339,7 +339,6 @@ void ping_err(struct sk_buff *skb, u32 info) sk = ping_v4_lookup(net, iph->daddr, iph->saddr, ntohs(icmph->un.echo.id), skb->dev->ifindex); if (sk == NULL) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); pr_debug("no socket, dropping\n"); return; /* No socket for error */ } @@ -679,7 +678,6 @@ static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb(sk, skb) < 0) { - ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); kfree_skb(skb); pr_debug("ping_queue_rcv_skb -> failed\n"); return -1; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 466ea8bb7a4d..961eed4f510a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -288,7 +288,7 @@ static void icmpmsg_put(struct seq_file *seq) count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); + val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); if (val) { type[count] = i; vals[count++] = val; @@ -307,6 +307,7 @@ static void icmp_put(struct seq_file *seq) { int i; struct net *net = seq->private; + atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors"); for (i=0; icmpmibmap[i].name != NULL; i++) @@ -319,15 +320,13 @@ static void icmp_put(struct seq_file *seq) snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, - icmpmibmap[i].index)); + atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, - icmpmibmap[i].index | 0x100)); + atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 007e2eb769d3..3ccda5ae8a27 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -292,7 +292,8 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ - if (ip_queue_rcv_skb(sk, skb) < 0) { + ipv4_pktinfo_prepare(skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } @@ -327,6 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, unsigned int iphlen; int err; struct rtable *rt = *rtp; + int hlen, tlen; if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -336,12 +338,14 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, if (flags&MSG_PROBE) goto out; + hlen = LL_RESERVED_SPACE(rt->dst.dev); + tlen = rt->dst.dev->needed_tailroom; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, + length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); + skb_reserve(skb, hlen); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 155138d8ec8b..90402a2a26a9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -108,7 +108,6 @@ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif -#include <net/atmclip.h> #include <net/secure_seq.h> #define RT_FL_TOS(oldflp4) \ @@ -131,6 +130,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; +static int redirect_genid; /* * Interface to generic destination cache. @@ -138,7 +138,7 @@ static int rt_chain_length_max __read_mostly = 20; static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); -static unsigned int ipv4_default_mtu(const struct dst_entry *dst); +static unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); @@ -193,7 +193,7 @@ static struct dst_ops ipv4_dst_ops = { .gc = rt_garbage_collect, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, - .default_mtu = ipv4_default_mtu, + .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, @@ -416,9 +416,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) else { struct rtable *r = v; struct neighbour *n; - int len; + int len, HHUptod; + + rcu_read_lock(); + n = dst_get_neighbour_noref(&r->dst); + HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; + rcu_read_unlock(); - n = dst_get_neighbour(&r->dst); seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->dst.dev ? r->dst.dev->name : "*", @@ -432,7 +436,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_RTTVAR)), r->rt_key_tos, -1, - (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0, + HHUptod, r->rt_spec_dst, &len); seq_printf(seq, "%*s\n", 127 - len, ""); @@ -837,6 +841,7 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); + redirect_genid++; } /* @@ -1013,23 +1018,18 @@ static int slow_chain_length(const struct rtable *head) static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { - struct neigh_table *tbl = &arp_tbl; static const __be32 inaddr_any = 0; struct net_device *dev = dst->dev; const __be32 *pkey = daddr; struct neighbour *n; -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - if (dev->type == ARPHRD_ATM) - tbl = clip_tbl_hook; -#endif if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) pkey = &inaddr_any; - n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey); + n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey); if (n) return n; - return neigh_create(tbl, pkey, dev); + return neigh_create(&arp_tbl, pkey, dev); } static int rt_bind_neighbour(struct rtable *rt) @@ -1304,16 +1304,42 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; + + dst_confirm(&rt->dst); + + rt->rt_gateway = peer->redirect_learned.a4; + + n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + } + return 0; +} + /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rt; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; - struct flowi4 fl4; struct inet_peer *peer; struct net *net; @@ -1336,33 +1362,44 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = daddr; for (s = 0; s < 2; s++) { for (i = 0; i < 2; i++) { - fl4.flowi4_oif = ikeys[i]; - fl4.saddr = skeys[s]; - rt = __ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - continue; - - if (rt->dst.error || rt->dst.dev != dev || - rt->rt_gateway != old_gw) { - ip_rt_put(rt); - continue; - } - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); + unsigned int hash; + struct rtable __rcu **rthp; + struct rtable *rt; + + hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); + + rthp = &rt_hash_table[hash].chain; + + while ((rt = rcu_dereference(*rthp)) != NULL) { + rthp = &rt->dst.rt_next; + + if (rt->rt_key_dst != daddr || + rt->rt_key_src != skeys[s] || + rt->rt_oif != ikeys[i] || + rt_is_input_route(rt) || + rt_is_expired(rt) || + !net_eq(dev_net(rt->dst.dev), net) || + rt->dst.error || + rt->dst.dev != dev || + rt->rt_gateway != old_gw) + continue; - peer = rt->peer; - if (peer) { - peer->redirect_learned.a4 = new_gw; - atomic_inc(&__rt_peer_genid); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + if (peer->redirect_learned.a4 != new_gw || + peer->redirect_genid != redirect_genid) { + peer->redirect_learned.a4 = new_gw; + peer->redirect_genid = redirect_genid; + atomic_inc(&__rt_peer_genid); + } + check_peer_redir(&rt->dst, peer); + } } - - ip_rt_put(rt); - return; } } return; @@ -1649,40 +1686,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } } -static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) -{ - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - struct neighbour *n, *old_n; - - dst_confirm(&rt->dst); - - rt->rt_gateway = peer->redirect_learned.a4; - n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); - if (IS_ERR(n)) - return PTR_ERR(n); - old_n = xchg(&rt->dst._neighbour, n); - if (old_n) - neigh_release(old_n); - if (!n || !(n->nud_state & NUD_VALID)) { - if (n) - neigh_event_send(n, NULL); - rt->rt_gateway = orig_gw; - return -EAGAIN; - } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); - } - return 0; -} - -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +static struct rtable *ipv4_validate_peer(struct rtable *rt) { - struct rtable *rt = (struct rtable *) dst; - - if (rt_is_expired(rt)) - return NULL; if (rt->rt_peer_genid != rt_peer_genid()) { struct inet_peer *peer; @@ -1691,17 +1697,29 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) peer = rt->peer; if (peer) { - check_peer_pmtu(dst, peer); + check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_genid != redirect_genid) + peer->redirect_learned.a4 = 0; if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { - if (check_peer_redir(dst, peer)) + if (check_peer_redir(&rt->dst, peer)) return NULL; } } rt->rt_peer_genid = rt_peer_genid(); } + return rt; +} + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rtable *rt = (struct rtable *) dst; + + if (rt_is_expired(rt)) + return NULL; + dst = (struct dst_entry *) ipv4_validate_peer(rt); return dst; } @@ -1806,12 +1824,17 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) return advmss; } -static unsigned int ipv4_default_mtu(const struct dst_entry *dst) +static unsigned int ipv4_mtu(const struct dst_entry *dst) { - unsigned int mtu = dst->dev->mtu; + const struct rtable *rt = (const struct rtable *) dst; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu && rt_is_output_route(rt)) + return mtu; + + mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - const struct rtable *rt = (const struct rtable *) dst; if (rt->rt_gateway != rt->rt_dst && mtu > 576) mtu = 576; @@ -1844,6 +1867,8 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, peer->metrics, false); check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_genid != redirect_genid) + peer->redirect_learned.a4 = 0; if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; @@ -2349,6 +2374,9 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { + rth = ipv4_validate_peer(rth); + if (!rth) + continue; if (noref) { dst_use_noref(&rth->dst, jiffies); skb_dst_set_noref(skb, &rth->dst); @@ -2724,6 +2752,9 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { + rth = ipv4_validate_peer(rth); + if (!rth) + continue; dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2747,9 +2778,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo return NULL; } -static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) +static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) { - return 0; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; } static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -2767,7 +2800,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, - .default_mtu = ipv4_blackhole_default_mtu, + .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, .cow_metrics = ipv4_rt_blackhole_cow_metrics, @@ -2845,7 +2878,7 @@ static int rt_fill_info(struct net *net, struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; - long expires = 0; + unsigned long expires = 0; const struct inet_peer *peer = rt->peer; u32 id = 0, ts = 0, tsage = 0, error; @@ -2902,8 +2935,12 @@ static int rt_fill_info(struct net *net, tsage = get_seconds() - peer->tcp_ts_stamp; } expires = ACCESS_ONCE(peer->pmtu_expires); - if (expires) - expires -= jiffies; + if (expires) { + if (time_before(jiffies, expires)) + expires -= jiffies; + else + expires = 0; + } } if (rt_is_input_route(rt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 34f5db1e1c8b..43dfccce62e9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -888,18 +888,18 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, } EXPORT_SYMBOL(tcp_sendpage); -#define TCP_PAGE(sk) (sk->sk_sndmsg_page) -#define TCP_OFF(sk) (sk->sk_sndmsg_off) - -static inline int select_size(const struct sock *sk, int sg) +static inline int select_size(const struct sock *sk, bool sg) { const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { - if (sk_can_gso(sk)) - tmp = 0; - else { + if (sk_can_gso(sk)) { + /* Small frames wont use a full page: + * Payload will immediately follow tcp header. + */ + tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); + } else { int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); if (tmp >= pgbreak && @@ -917,9 +917,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int iovlen, flags; + int iovlen, flags, err, copied; int mss_now, size_goal; - int sg, err, copied; + bool sg; long timeo; lock_sock(sk); @@ -946,7 +946,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; - sg = sk->sk_route_caps & NETIF_F_SG; + sg = !!(sk->sk_route_caps & NETIF_F_SG); while (--iovlen >= 0) { size_t seglen = iov->iov_len; @@ -1005,8 +1005,13 @@ new_segment: } else { int merge = 0; int i = skb_shinfo(skb)->nr_frags; - struct page *page = TCP_PAGE(sk); - int off = TCP_OFF(sk); + struct page *page = sk->sk_sndmsg_page; + int off; + + if (page && page_count(page) == 1) + sk->sk_sndmsg_off = 0; + + off = sk->sk_sndmsg_off; if (skb_can_coalesce(skb, i, page, off) && off != PAGE_SIZE) { @@ -1023,7 +1028,7 @@ new_segment: } else if (page) { if (off == PAGE_SIZE) { put_page(page); - TCP_PAGE(sk) = page = NULL; + sk->sk_sndmsg_page = page = NULL; off = 0; } } else @@ -1049,9 +1054,9 @@ new_segment: /* If this page was new, give it to the * socket so it does not get leaked. */ - if (!TCP_PAGE(sk)) { - TCP_PAGE(sk) = page; - TCP_OFF(sk) = 0; + if (!sk->sk_sndmsg_page) { + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; } goto do_error; } @@ -1061,15 +1066,15 @@ new_segment: skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, page, off, copy); - if (TCP_PAGE(sk)) { + if (sk->sk_sndmsg_page) { get_page(page); } else if (off + copy < PAGE_SIZE) { get_page(page); - TCP_PAGE(sk) = page; + sk->sk_sndmsg_page = page; } } - TCP_OFF(sk) = off + copy; + sk->sk_sndmsg_off = off + copy; } if (!copied) @@ -2653,7 +2658,8 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 850c737e08e2..fc6d475f488f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -292,7 +292,7 @@ int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && left * tp->mss_cache < sk->sk_gso_max_size) return 1; - return left <= tcp_max_burst(tp); + return left <= tcp_max_tso_deferred_mss(tp); } EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 52b5c2d0ecd0..0cbb44076cfa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2858,7 +2858,7 @@ static void tcp_try_keep_open(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int state = TCP_CA_Open; - if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) + if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { @@ -2881,7 +2881,8 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - tcp_moderate_cwnd(tp); + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk, flag); } @@ -3009,11 +3010,11 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int newly_acked_sacked, int flag) + int newly_acked_sacked, bool is_dupack, + int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); int fast_rexmit = 0, mib_idx; @@ -3066,17 +3067,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } break; - case TCP_CA_Disorder: - tcp_try_undo_dsack(sk); - if (!tp->undo_marker || - /* For SACK case do not Open to allow to undo - * catching for all duplicate ACKs. */ - tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { - tp->undo_marker = 0; - tcp_set_ca_state(sk, TCP_CA_Open); - } - break; - case TCP_CA_Recovery: if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); @@ -3117,7 +3107,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_add_reno_sack(sk); } - if (icsk->icsk_ca_state == TCP_CA_Disorder) + if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk)) { @@ -3681,10 +3671,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; + bool is_dupack = false; u32 prior_in_flight; u32 prior_fackets; int prior_packets; int prior_sacked = tp->sacked_out; + int pkts_acked = 0; int newly_acked_sacked = 0; int frto_cwnd = 0; @@ -3757,6 +3749,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); + pkts_acked = prior_packets - tp->packets_out; newly_acked_sacked = (prior_packets - prior_sacked) - (tp->packets_out - tp->sacked_out); @@ -3771,8 +3764,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); - tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, - newly_acked_sacked, flag); + is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); + tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + is_dupack, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight); @@ -3784,6 +3778,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + /* If data was DSACKed, see if we can undo a cwnd reduction. */ + if (flag & FLAG_DSACKING_ACK) + tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3797,10 +3795,14 @@ invalid_ack: return -1; old_ack: + /* If data was SACKed, tag it and see if we should send more data. + * If data was DSACKed, see if we can undo a cwnd reduction. + */ if (TCP_SKB_CB(skb)->sacked) { - tcp_sacktag_write_queue(sk, skb, prior_snd_una); - if (icsk->icsk_ca_state == TCP_CA_Open) - tcp_try_keep_open(sk); + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + newly_acked_sacked = tp->sacked_out - prior_sacked; + tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + is_dupack, flag); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -5809,6 +5811,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if (th->syn) { + if (th->fin) + goto discard; if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a7443159c400..c4b8b09db9f5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1510,6 +1510,8 @@ exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: + tcp_clear_xmit_timers(newsk); + tcp_cleanup_congestion_control(newsk); bh_unlock_sock(newsk); sock_put(newsk); goto exit; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 66363b689ad6..9dc146e5ed65 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -343,8 +343,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); tw6 = inet6_twsk((struct sock *)tw); - ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw6->tw_v6_daddr = np->daddr; + tw6->tw_v6_rcv_saddr = np->rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_ipv6only = np->ipv6only; } @@ -425,7 +425,7 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, */ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) { - struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); if (newsk != NULL) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -495,7 +495,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; - newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && + !try_module_get(newicsk->icsk_ca_ops->owner)) + newicsk->icsk_ca_ops = &tcp_init_congestion_ops; tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 980b98f6288c..cf3068038942 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1093,6 +1093,13 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; + eat = min_t(int, len, skb_headlen(skb)); + if (eat) { + __skb_pull(skb, eat); + len -= eat; + if (!len) + return; + } eat = len; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { @@ -1124,11 +1131,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; - /* If len == headlen, we avoid __skb_pull to preserve alignment. */ - if (unlikely(len < skb_headlen(skb))) - __skb_pull(skb, len); - else - __pskb_trim_head(skb, len - skb_headlen(skb)); + __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_PARTIAL; @@ -1382,7 +1385,7 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) /* Return 0, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ @@ -1581,7 +1584,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) * frame, so if we have space for more than 3 frames * then send now. */ - if (limit > tcp_max_burst(tp) * tp->mss_cache) + if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } @@ -2147,7 +2150,15 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + /* make sure skb->data is aligned on arches that require it */ + if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { + struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, + GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } else { + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + } if (err == 0) { /* Update global TCP statistics. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ab0966df1e2a..ad481b32f1e3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1164,7 +1164,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; - unsigned int ulen; + unsigned int ulen, copied; int peeked; int err; int is_udplite = IS_UDPLITE(sk); @@ -1186,9 +1186,10 @@ try_again: goto out; ulen = skb->len - sizeof(struct udphdr); - if (len > ulen) - len = ulen; - else if (len < ulen) + copied = len; + if (copied > ulen) + copied = ulen; + else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; /* @@ -1197,14 +1198,14 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { if (udp_lib_checksum_complete(skb)) goto csum_copy_err; } if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, len); + msg->msg_iov, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), @@ -1233,7 +1234,7 @@ try_again: if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); - err = len; + err = copied; if (flags & MSG_TRUNC) err = ulen; @@ -1357,7 +1358,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) sock_rps_save_rxhash(sk, skb); - rc = ip_queue_rcv_skb(sk, skb); + rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1473,6 +1474,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = 0; + ipv4_pktinfo_prepare(skb); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); @@ -2246,7 +2248,8 @@ int udp4_ufo_send_check(struct sk_buff *skb) return 0; } -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index cf88df82e2c2..058cc222b3f1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -636,7 +636,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out; } - ipv6_addr_copy(&ifa->addr, addr); + ifa->addr = *addr; spin_lock_init(&ifa->lock); spin_lock_init(&ifa->state_lock); @@ -657,7 +657,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, * layer address of our nexhop router */ - if (dst_get_neighbour_raw(&rt->dst) == NULL) + if (dst_get_neighbour_noref_raw(&rt->dst) == NULL) ifa->flags &= ~IFA_F_OPTIMISTIC; ifa->idev = idev; @@ -1228,7 +1228,7 @@ try_nextdev: if (!hiscore->ifa) return -EADDRNOTAVAIL; - ipv6_addr_copy(saddr, &hiscore->ifa->addr); + *saddr = hiscore->ifa->addr; in6_ifa_put(hiscore->ifa); return 0; } @@ -1249,7 +1249,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, list_for_each_entry(ifp, &idev->addr_list, if_list) { if (ifp->scope == IFA_LINK && !(ifp->flags & banned_flags)) { - ipv6_addr_copy(addr, &ifp->addr); + *addr = ifp->addr; err = 0; break; } @@ -1700,7 +1700,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_protocol = RTPROT_KERNEL, }; - ipv6_addr_copy(&cfg.fc_dst, pfx); + cfg.fc_dst = *pfx; /* Prevent useless cloning on PtP SIT. This thing is done here expecting that the whole diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d27c797f9f05..7694c82e629d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -347,7 +347,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!inet->transparent && + if (!(inet->freebind || inet->transparent) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; @@ -361,10 +361,10 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_rcv_saddr = v4addr; inet->inet_saddr = v4addr; - ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + np->rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) - ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. */ if (sk->sk_prot->get_port(sk, snum)) { @@ -458,14 +458,14 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, peer == 1) return -ENOTCONN; sin->sin6_port = inet->inet_dport; - ipv6_addr_copy(&sin->sin6_addr, &np->daddr); + sin->sin6_addr = np->daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; } else { if (ipv6_addr_any(&np->rcv_saddr)) - ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + sin->sin6_addr = np->saddr; else - ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr); + sin->sin6_addr = np->rcv_saddr; sin->sin6_port = inet->inet_sport; } @@ -660,8 +660,8 @@ int inet6_sk_rebuild_header(struct sock *sk) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = sk->sk_protocol; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; fl6.flowlabel = np->flow_label; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; @@ -769,7 +769,8 @@ out: return err; } -static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features) +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; @@ -985,9 +986,9 @@ static int __net_init ipv6_init_mibs(struct net *net) sizeof(struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void __percpu **)net->mib.icmpv6msg_statistics, - sizeof(struct icmpv6msg_mib), - __alignof__(struct icmpv6msg_mib)) < 0) + net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), + GFP_KERNEL); + if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; return 0; @@ -1008,7 +1009,7 @@ static void ipv6_cleanup_mibs(struct net *net) snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); - snmp_mib_free((void __percpu **)net->mib.icmpv6msg_statistics); + kfree(net->mib.icmpv6msg_statistics); } static int __net_init inet6_net_init(struct net *net) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 2195ae651923..2ae79dbeec2f 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -193,9 +193,9 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length); goto bad; } - ipv6_addr_copy(&final_addr, &hao->addr); - ipv6_addr_copy(&hao->addr, &iph->saddr); - ipv6_addr_copy(&iph->saddr, &final_addr); + final_addr = hao->addr; + hao->addr = iph->saddr; + iph->saddr = final_addr; } break; } @@ -241,13 +241,13 @@ static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) segments = rthdr->hdrlen >> 1; addrs = ((struct rt0_hdr *)rthdr)->addr; - ipv6_addr_copy(&final_addr, addrs + segments - 1); + final_addr = addrs[segments - 1]; addrs += segments - segments_left; memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); - ipv6_addr_copy(addrs, &iph->daddr); - ipv6_addr_copy(&iph->daddr, &final_addr); + addrs[0] = iph->daddr; + iph->daddr = final_addr; } static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) @@ -324,8 +324,6 @@ static void ah6_output_done(struct crypto_async_request *base, int err) #endif } - err = ah->nexthdr; - kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, err); } @@ -466,12 +464,12 @@ static void ah6_input_done(struct crypto_async_request *base, int err) if (err) goto out; + err = ah->nexthdr; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, hdr_len); __skb_pull(skb, ah_hlen + hdr_len); skb_set_transport_header(skb, -hdr_len); - - err = ah->nexthdr; out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); @@ -583,8 +581,6 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) - err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 674255f5e6b7..fc1cdcd7041a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -75,7 +75,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (pac == NULL) return -ENOMEM; pac->acl_next = NULL; - ipv6_addr_copy(&pac->acl_addr, addr); + pac->acl_addr = *addr; rcu_read_lock(); if (ifindex == 0) { @@ -296,7 +296,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) goto out; } - ipv6_addr_copy(&aca->aca_addr, addr); + aca->aca_addr = *addr; aca->aca_idev = idev; aca->aca_rt = rt; aca->aca_users = 1; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e2480691c220..ae08aee1773c 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -71,7 +71,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + usin->sin6_addr = flowlabel->dst; } } @@ -143,7 +143,7 @@ ipv4_connected: } } - ipv6_addr_copy(&np->daddr, daddr); + np->daddr = *daddr; np->flow_label = fl6.flowlabel; inet->inet_dport = usin->sin6_port; @@ -154,8 +154,8 @@ ipv4_connected: */ fl6.flowi6_proto = sk->sk_protocol; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; @@ -179,10 +179,10 @@ ipv4_connected: /* source address lookup done in ip6_dst_lookup */ if (ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&np->saddr, &fl6.saddr); + np->saddr = fl6.saddr; if (ipv6_addr_any(&np->rcv_saddr)) { - ipv6_addr_copy(&np->rcv_saddr, &fl6.saddr); + np->rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); @@ -257,7 +257,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); - ipv6_addr_copy(&iph->daddr, &fl6->daddr); + iph->daddr = fl6->daddr; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; @@ -294,7 +294,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); - ipv6_addr_copy(&iph->daddr, &fl6->daddr); + iph->daddr = fl6->daddr; mtu_info = IP6CBMTU(skb); @@ -303,7 +303,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) mtu_info->ip6m_addr.sin6_port = 0; mtu_info->ip6m_addr.sin6_flowinfo = 0; mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; - ipv6_addr_copy(&mtu_info->ip6m_addr.sin6_addr, &ipv6_hdr(skb)->daddr); + mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); @@ -354,8 +354,8 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) sin->sin6_port = serr->port; sin->sin6_scope_id = 0; if (skb->protocol == htons(ETH_P_IPV6)) { - ipv6_addr_copy(&sin->sin6_addr, - (struct in6_addr *)(nh + serr->addr_offset)); + sin->sin6_addr = + *(struct in6_addr *)(nh + serr->addr_offset); if (np->sndflow) sin->sin6_flowinfo = (*(__be32 *)(nh + serr->addr_offset - 24) & @@ -376,7 +376,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; if (skb->protocol == htons(ETH_P_IPV6)) { - ipv6_addr_copy(&sin->sin6_addr, &ipv6_hdr(skb)->saddr); + sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) datagram_recv_ctl(sk, msg, skb); if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) @@ -451,7 +451,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len) sin->sin6_flowinfo = 0; sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; - ipv6_addr_copy(&sin->sin6_addr, &mtu_info.ip6m_addr.sin6_addr); + sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); @@ -475,7 +475,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } @@ -550,7 +550,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { @@ -584,7 +584,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) */ sin6.sin6_family = AF_INET6; - ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr); + sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; sin6.sin6_scope_id = 0; @@ -654,12 +654,12 @@ int datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!inet_sk(sk)->transparent && + if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && !ipv6_chk_addr(net, &src_info->ipi6_addr, strict ? dev : NULL, 0)) err = -EINVAL; else - ipv6_addr_copy(&fl6->saddr, &src_info->ipi6_addr); + fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bf22a225f422..3d641b6e9b09 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -243,9 +243,9 @@ static int ipv6_dest_hao(struct sk_buff *skb, int optoff) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; - ipv6_addr_copy(&tmp_addr, &ipv6h->saddr); - ipv6_addr_copy(&ipv6h->saddr, &hao->addr); - ipv6_addr_copy(&hao->addr, &tmp_addr); + tmp_addr = ipv6h->saddr; + ipv6h->saddr = hao->addr; + hao->addr = tmp_addr; if (skb->tstamp.tv64 == 0) __net_timestamp(skb); @@ -461,9 +461,9 @@ looped_back: return -1; } - ipv6_addr_copy(&daddr, addr); - ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr); - ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr); + daddr = *addr; + *addr = ipv6_hdr(skb)->daddr; + ipv6_hdr(skb)->daddr = daddr; skb_dst_drop(skb); ip6_route_input(skb); @@ -690,7 +690,7 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); - ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; @@ -888,8 +888,8 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, if (!opt || !opt->srcrt) return NULL; - ipv6_addr_copy(orig, &fl6->daddr); - ipv6_addr_copy(&fl6->daddr, ((struct rt0_hdr *)opt->srcrt)->addr); + *orig = fl6->daddr; + fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; return orig; } diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 37f548b7f6dc..72957f4a7c6c 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -57,6 +57,9 @@ int ipv6_ext_hdr(u8 nexthdr) * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. + * - Reports the offset field of the final fragment header so it is + * possible to tell whether this is a first fragment, later fragment, + * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. @@ -64,10 +67,13 @@ int ipv6_ext_hdr(u8 nexthdr) * --ANK (980726) */ -int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, + __be16 *frag_offp) { u8 nexthdr = *nexthdrp; + *frag_offp = 0; + while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; @@ -87,7 +93,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) if (fp == NULL) return -1; - if (ntohs(*fp) & ~0x7) + *frag_offp = *fp; + if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 295571576f83..b6c573152067 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -96,7 +96,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) goto again; - ipv6_addr_copy(&flp6->saddr, &saddr); + flp6->saddr = saddr; } goto out; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 90868fb42757..01d46bff63c3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -135,11 +135,12 @@ static int is_ineligible(struct sk_buff *skb) int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; __u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; if (len < 0) return 1; - ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr); + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); if (ptr < 0) return 0; if (nexthdr == IPPROTO_ICMPV6) { @@ -290,9 +291,9 @@ static void mip6_addr_swap(struct sk_buff *skb) if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - ipv6_addr_copy(&tmp, &iph->saddr); - ipv6_addr_copy(&iph->saddr, &hao->addr); - ipv6_addr_copy(&hao->addr, &tmp); + tmp = iph->saddr; + iph->saddr = hao->addr; + hao->addr = tmp; } } } @@ -444,9 +445,9 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl6.daddr, &hdr->saddr); + fl6.daddr = hdr->saddr; if (saddr) - ipv6_addr_copy(&fl6.saddr, saddr); + fl6.saddr = *saddr; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; @@ -538,9 +539,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr); + fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) - ipv6_addr_copy(&fl6.saddr, saddr); + fl6.saddr = *saddr; fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); @@ -596,6 +597,7 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) int inner_offset; int hash; u8 nexthdr; + __be16 frag_off; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return; @@ -603,7 +605,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ - inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); if (inner_offset<0) return; } else { @@ -786,8 +789,8 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, int oif) { memset(fl6, 0, sizeof(*fl6)); - ipv6_addr_copy(&fl6->saddr, saddr); - ipv6_addr_copy(&fl6->daddr, daddr); + fl6->saddr = *saddr; + fl6->daddr = *daddr; fl6->flowi6_proto = IPPROTO_ICMPV6; fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index fee46d5a2f12..02dd203d9eac 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -65,9 +65,9 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; - ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr); + fl6.daddr = treq->rmt_addr; final_p = fl6_update_dst(&fl6, np->opt, &final); - ipv6_addr_copy(&fl6.saddr, &treq->loc_addr); + fl6.saddr = treq->loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet_rsk(req)->rmt_port; @@ -85,7 +85,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk, * request_sock (formerly open request) hash tables. */ static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u16 synq_hsize) + const u32 rnd, const u32 synq_hsize) { u32 c; @@ -157,7 +157,7 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_addr = np->daddr; sin6->sin6_port = inet_sk(sk)->inet_dport; /* We do not store received flowlabel for TCP */ sin6->sin6_flowinfo = 0; @@ -215,8 +215,8 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = sk->sk_protocol; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; fl6.flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl6.flowlabel); fl6.flowi6_oif = sk->sk_bound_dev_if; @@ -246,7 +246,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) skb_dst_set_noref(skb, dst); /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl6.daddr, &np->daddr); + fl6.daddr = np->daddr; res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); rcu_read_unlock(); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 93718f3db79b..278363123657 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -190,7 +190,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); - if (table != NULL) { + if (table) { table->tb6_id = id; table->tb6_root.leaf = net->ipv6.ip6_null_entry; table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; @@ -210,7 +210,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id) return tb; tb = fib6_alloc_table(net, id); - if (tb != NULL) + if (tb) fib6_link_table(net, tb); return tb; @@ -367,7 +367,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) s_e = cb->args[1]; w = (void *)cb->args[2]; - if (w == NULL) { + if (!w) { /* New dump: * * 1. hook callback destructor. @@ -379,7 +379,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (w == NULL) + if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; @@ -425,7 +425,8 @@ out: static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, int addrlen, int plen, - int offset) + int offset, int allow_create, + int replace_required) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -447,8 +448,18 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, * Prefix match */ if (plen < fn->fn_bit || - !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { + if (!allow_create) { + if (replace_required) { + pr_warn("IPv6: Can't replace route, " + "no match found\n"); + return ERR_PTR(-ENOENT); + } + pr_warn("IPv6: NLM_F_CREATE should be set " + "when creating new route\n"); + } goto insert_above; + } /* * Exact match ? @@ -456,7 +467,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, if (plen == fn->fn_bit) { /* clean up an intermediate node */ - if ((fn->fn_flags & RTN_RTINFO) == 0) { + if (!(fn->fn_flags & RTN_RTINFO)) { rt6_release(fn->leaf); fn->leaf = NULL; } @@ -477,6 +488,23 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, fn = dir ? fn->right: fn->left; } while (fn); + if (!allow_create) { + /* We should not create new node because + * NLM_F_REPLACE was specified without NLM_F_CREATE + * I assume it is safe to require NLM_F_CREATE when + * REPLACE flag is used! Later we may want to remove the + * check for replace_required, because according + * to netlink specification, NLM_F_CREATE + * MUST be specified if new route is created. + * That would keep IPv6 consistent with IPv4 + */ + if (replace_required) { + pr_warn("IPv6: Can't replace route, no match found\n"); + return ERR_PTR(-ENOENT); + } + pr_warn("IPv6: NLM_F_CREATE should be set " + "when creating new route\n"); + } /* * We walked to the bottom of tree. * Create new leaf node without children. @@ -484,7 +512,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, ln = node_alloc(); - if (ln == NULL) + if (!ln) return NULL; ln->fn_bit = plen; @@ -527,7 +555,7 @@ insert_above: in = node_alloc(); ln = node_alloc(); - if (in == NULL || ln == NULL) { + if (!in || !ln) { if (in) node_free(in); if (ln) @@ -581,7 +609,7 @@ insert_above: ln = node_alloc(); - if (ln == NULL) + if (!ln) return NULL; ln->fn_bit = plen; @@ -614,10 +642,15 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, { struct rt6_info *iter = NULL; struct rt6_info **ins; + int replace = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int add = (!info->nlh || + (info->nlh->nlmsg_flags & NLM_F_CREATE)); + int found = 0; ins = &fn->leaf; - for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) { + for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { /* * Search for duplicates */ @@ -626,15 +659,22 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, /* * Same priority level */ + if (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_EXCL)) + return -EEXIST; + if (replace) { + found++; + break; + } if (iter->rt6i_dev == rt->rt6i_dev && iter->rt6i_idev == rt->rt6i_idev && ipv6_addr_equal(&iter->rt6i_gateway, &rt->rt6i_gateway)) { - if (!(iter->rt6i_flags&RTF_EXPIRES)) + if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; iter->rt6i_expires = rt->rt6i_expires; - if (!(rt->rt6i_flags&RTF_EXPIRES)) { + if (!(rt->rt6i_flags & RTF_EXPIRES)) { iter->rt6i_flags &= ~RTF_EXPIRES; iter->rt6i_expires = 0; } @@ -655,17 +695,40 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, /* * insert node */ + if (!replace) { + if (!add) + pr_warn("IPv6: NLM_F_CREATE should be set when creating new route\n"); + +add: + rt->dst.rt6_next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + info->nl_net->ipv6.rt6_stats->fib_rt_entries++; + + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } - rt->dst.rt6_next = iter; - *ins = rt; - rt->rt6i_node = fn; - atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); - info->nl_net->ipv6.rt6_stats->fib_rt_entries++; - - if ((fn->fn_flags & RTN_RTINFO) == 0) { - info->nl_net->ipv6.rt6_stats->fib_route_nodes++; - fn->fn_flags |= RTN_RTINFO; + } else { + if (!found) { + if (add) + goto add; + pr_warn("IPv6: NLM_F_REPLACE set, but no existing node found!\n"); + return -ENOENT; + } + *ins = rt; + rt->rt6i_node = fn; + rt->dst.rt6_next = iter->dst.rt6_next; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + rt6_release(iter); + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } } return 0; @@ -674,7 +737,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) + (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -696,11 +759,28 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; + int allow_create = 1; + int replace_required = 0; + + if (info->nlh) { + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) + allow_create = 0; + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) + replace_required = 1; + } + if (!allow_create && !replace_required) + pr_warn("IPv6: RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); + rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), + allow_create, replace_required); + + if (IS_ERR(fn)) { + err = PTR_ERR(fn); + fn = NULL; + } - if (fn == NULL) + if (!fn) goto out; pn = fn; @@ -709,7 +789,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (fn->subtree == NULL) { + if (!fn->subtree) { struct fib6_node *sfn; /* @@ -724,7 +804,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) /* Create subtree root node */ sfn = node_alloc(); - if (sfn == NULL) + if (!sfn) goto st_failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; @@ -736,9 +816,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) sn = fib6_add_1(sfn, &rt->rt6i_src.addr, sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src)); + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); - if (sn == NULL) { + if (!sn) { /* If it is failed, discard just allocated root, and then (in st_failure) stale node in main tree. @@ -753,13 +834,18 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) } else { sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src)); + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); - if (sn == NULL) + if (IS_ERR(sn)) { + err = PTR_ERR(sn); + sn = NULL; + } + if (!sn) goto st_failure; } - if (fn->leaf == NULL) { + if (!fn->leaf) { fn->leaf = rt; atomic_inc(&rt->rt6i_ref); } @@ -768,10 +854,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) #endif err = fib6_add_rt2node(fn, rt, info); - - if (err == 0) { + if (!err) { fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags&RTF_CACHE)) + if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn, rt); } @@ -819,7 +904,7 @@ st_failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ + int offset; /* key offset on rt6_info */ const struct in6_addr *addr; /* search key */ }; @@ -849,11 +934,10 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, fn = next; continue; } - break; } - while(fn) { + while (fn) { if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { struct rt6key *key; @@ -900,8 +984,7 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *da }; fn = fib6_lookup_1(root, daddr ? args : args + 1); - - if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) + if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; @@ -961,7 +1044,7 @@ struct fib6_node * fib6_locate(struct fib6_node *root, } #endif - if (fn && fn->fn_flags&RTN_RTINFO) + if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; @@ -975,14 +1058,13 @@ struct fib6_node * fib6_locate(struct fib6_node *root, static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) { - if (fn->fn_flags&RTN_ROOT) + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; - while(fn) { - if(fn->left) + while (fn) { + if (fn->left) return fn->left->leaf; - - if(fn->right) + if (fn->right) return fn->right->leaf; fn = FIB6_SUBTREE(fn); @@ -1020,12 +1102,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ - || (children && fn->fn_flags&RTN_ROOT) + || (children && fn->fn_flags & RTN_ROOT) #endif ) { fn->leaf = fib6_find_prefix(net, fn); #if RT6_DEBUG >= 2 - if (fn->leaf==NULL) { + if (!fn->leaf) { WARN_ON(!fn->leaf); fn->leaf = net->ipv6.ip6_null_entry; } @@ -1058,7 +1140,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&fib6_walker_lock); FOR_WALKERS(w) { - if (child == NULL) { + if (!child) { if (w->root == fn) { w->root = w->node = NULL; RT6_TRACE("W %p adjusted by delroot 1\n", w); @@ -1087,7 +1169,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_unlock(&fib6_walker_lock); node_free(fn); - if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn)) + if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; rt6_release(pn->leaf); @@ -1121,7 +1203,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rt->dst.rt6_next; - if (w->leaf == NULL) + if (!w->leaf) w->state = FWS_U; } } @@ -1130,7 +1212,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt->dst.rt6_next = NULL; /* If it was last route, expunge its radix tree node */ - if (fn->leaf == NULL) { + if (!fn->leaf) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; fn = fib6_repair_tree(net, fn); @@ -1144,7 +1226,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, * to still alive ones. */ while (fn) { - if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { + if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { fn->leaf = fib6_find_prefix(net, fn); atomic_inc(&fn->leaf->rt6i_ref); rt6_release(rt); @@ -1171,17 +1253,17 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) return -ENOENT; } #endif - if (fn == NULL || rt == net->ipv6.ip6_null_entry) + if (!fn || rt == net->ipv6.ip6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags&RTF_CACHE)) { + if (!(rt->rt6i_flags & RTF_CACHE)) { struct fib6_node *pn = fn; #ifdef CONFIG_IPV6_SUBTREES /* clones of this route might be in another subtree */ if (rt->rt6i_src.plen) { - while (!(pn->fn_flags&RTN_ROOT)) + while (!(pn->fn_flags & RTN_ROOT)) pn = pn->parent; pn = pn->parent; } @@ -1232,11 +1314,11 @@ static int fib6_walk_continue(struct fib6_walker_t *w) for (;;) { fn = w->node; - if (fn == NULL) + if (!fn) return 0; if (w->prune && fn != w->root && - fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { w->state = FWS_C; w->leaf = fn->leaf; } @@ -1265,7 +1347,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->state = FWS_C; w->leaf = fn->leaf; case FWS_C: - if (w->leaf && fn->fn_flags&RTN_RTINFO) { + if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->count < w->skip) { @@ -1439,7 +1521,7 @@ static int fib6_age(struct rt6_info *rt, void *arg) * only if they are not in use now. */ - if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { + if (rt->rt6i_flags & RTF_EXPIRES && rt->rt6i_expires) { if (time_after(now, rt->rt6i_expires)) { RT6_TRACE("expiring %p\n", rt); return -1; @@ -1451,7 +1533,7 @@ static int fib6_age(struct rt6_info *rt, void *arg) RT6_TRACE("aging clone %p\n", rt); return -1; } else if ((rt->rt6i_flags & RTF_GATEWAY) && - (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) { + (!(dst_get_neighbour_noref_raw(&rt->dst)->flags & NTF_ROUTER))) { RT6_TRACE("purging route %p via non-router but gateway\n", rt); return -1; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 4566dbd916d3..b7867a1215b1 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -386,7 +386,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, err = -EINVAL; goto done; } - ipv6_addr_copy(&fl->dst, &freq->flr_dst); + fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 027c7ff6f1e5..1ca5d45a12e8 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -111,6 +111,14 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt ipv6_addr_loopback(&hdr->daddr)) goto err; + /* + * RFC4291 2.7 + * Multicast addresses must not be used as source addresses in IPv6 + * packets or appear in any Routing header. + */ + if (ipv6_addr_is_multicast(&hdr->saddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); @@ -272,6 +280,7 @@ int ip6_mc_input(struct sk_buff *skb) u8 *ptr = skb_network_header(skb) + opt->ra; struct icmp6hdr *icmp6; u8 nexthdr = hdr->nexthdr; + __be16 frag_off; int offset; /* Check if the value of Router Alert @@ -285,7 +294,7 @@ int ip6_mc_input(struct sk_buff *skb) goto out; } offset = ipv6_skip_exthdr(skb, sizeof(*hdr), - &nexthdr); + &nexthdr, &frag_off); if (offset < 0) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 84d0bd5cac93..71d26999c955 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -136,7 +136,7 @@ static int ip6_finish_output2(struct sk_buff *skb) } rcu_read_lock(); - neigh = dst_get_neighbour(dst); + neigh = dst_get_neighbour_noref(dst); if (neigh) { int res = neigh_output(neigh, skb); @@ -238,8 +238,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hdr->nexthdr = proto; hdr->hop_limit = hlimit; - ipv6_addr_copy(&hdr->saddr, &fl6->saddr); - ipv6_addr_copy(&hdr->daddr, first_hop); + hdr->saddr = fl6->saddr; + hdr->daddr = *first_hop; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -290,8 +290,8 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, hdr->nexthdr = proto; hdr->hop_limit = np->hop_limit; - ipv6_addr_copy(&hdr->saddr, saddr); - ipv6_addr_copy(&hdr->daddr, daddr); + hdr->saddr = *saddr; + hdr->daddr = *daddr; return 0; } @@ -329,10 +329,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) { struct ipv6hdr *hdr = ipv6_hdr(skb); u8 nexthdr = hdr->nexthdr; + __be16 frag_off; int offset; if (ipv6_ext_hdr(nexthdr)) { - offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr); + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) return 0; } else @@ -462,7 +463,7 @@ int ip6_forward(struct sk_buff *skb) send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - n = dst_get_neighbour(dst); + n = dst_get_neighbour_noref(dst); if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct rt6_info *rt; @@ -631,6 +632,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; + int hroom, troom; __be32 frag_id = 0; int ptr, offset = 0, err=0; u8 *prevhdr, nexthdr = 0; @@ -797,6 +799,8 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; + hroom = LL_RESERVED_SPACE(rt->dst.dev); + troom = rt->dst.dev->needed_tailroom; /* * Keep copying data until we run out. @@ -815,7 +819,8 @@ slow_path: * Allocate buffer. */ - if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) { + if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC)) == NULL) { NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); @@ -828,7 +833,7 @@ slow_path: */ ip6_copy_metadata(frag, skb); - skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev)); + skb_reserve(frag, hroom); skb_put(frag, len + hlen + sizeof(struct frag_hdr)); skb_reset_network_header(frag); fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); @@ -978,7 +983,7 @@ static int ip6_dst_lookup_tail(struct sock *sk, * dst entry of the nexthop router */ rcu_read_lock(); - n = dst_get_neighbour(*dst); + n = dst_get_neighbour_noref(*dst); if (n && !(n->nud_state & NUD_VALID)) { struct inet6_ifaddr *ifp; struct flowi6 fl_gw6; @@ -1059,7 +1064,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (err) return ERR_PTR(err); if (final_dst) - ipv6_addr_copy(&fl6->daddr, final_dst); + fl6->daddr = *final_dst; if (can_sleep) fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; @@ -1095,7 +1100,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (err) return ERR_PTR(err); if (final_dst) - ipv6_addr_copy(&fl6->daddr, final_dst); + fl6->daddr = *final_dst; if (can_sleep) fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; @@ -1588,7 +1593,7 @@ int ip6_push_pending_frames(struct sock *sk) if (np->pmtudisc < IPV6_PMTUDISC_DO) skb->local_df = 1; - ipv6_addr_copy(final_dst, &fl6->daddr); + *final_dst = fl6->daddr; __skb_pull(skb, skb_network_header_len(skb)); if (opt && opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -1604,8 +1609,8 @@ int ip6_push_pending_frames(struct sock *sk) hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; - ipv6_addr_copy(&hdr->saddr, &fl6->saddr); - ipv6_addr_copy(&hdr->daddr, final_dst); + hdr->saddr = fl6->saddr; + hdr->daddr = *final_dst; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index bdc15c9003d7..f5f98f558acb 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -93,7 +93,7 @@ struct pcpu_tstats { unsigned long rx_bytes; unsigned long tx_packets; unsigned long tx_bytes; -}; +} __attribute__((aligned(4*sizeof(unsigned long)))); static struct net_device_stats *ip6_get_stats(struct net_device *dev) { @@ -289,6 +289,8 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) if ((err = register_netdevice(dev)) < 0) goto failed_free; + strcpy(t->parms.name, dev->name); + dev_hold(dev); ip6_tnl_link(ip6n, t); return t; @@ -977,8 +979,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; - ipv6_addr_copy(&ipv6h->saddr, &fl6->saddr); - ipv6_addr_copy(&ipv6h->daddr, &fl6->daddr); + ipv6h->saddr = fl6->saddr; + ipv6h->daddr = fl6->daddr; nf_reset(skb); pkt_len = skb->len; err = ip6_local_out(skb); @@ -1153,8 +1155,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ - ipv6_addr_copy(&fl6->saddr, &p->laddr); - ipv6_addr_copy(&fl6->daddr, &p->raddr); + fl6->saddr = p->laddr; + fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; @@ -1210,8 +1212,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) static int ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) { - ipv6_addr_copy(&t->parms.laddr, &p->laddr); - ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; @@ -1407,7 +1409,6 @@ ip6_tnl_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; - strcpy(t->parms.name, dev->name); dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; @@ -1487,6 +1488,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; @@ -1507,6 +1509,10 @@ static int __net_init ip6_tnl_init_net(struct net *net) err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 449a9185b8f2..c7e95c8c579f 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1105,8 +1105,8 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, msg->im6_msgtype = MRT6MSG_WHOLEPKT; msg->im6_mif = mrt->mroute_reg_vif_num; msg->im6_pad = 0; - ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); - ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else @@ -1131,8 +1131,8 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; - ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); - ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -2181,8 +2181,8 @@ int ip6mr_get_route(struct net *net, iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; - ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr); - ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr); + iph->saddr = rt->rt6i_src.addr; + iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c99e3ee9781f..18a2719003c3 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -435,7 +435,7 @@ sticky_done: goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; - ipv6_addr_copy(&np->sticky_pktinfo.ipi6_addr, &pkt.ipi6_addr); + np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } @@ -503,7 +503,7 @@ done: goto e_inval; if (val > 255 || val < -1) goto e_inval; - np->mcast_hops = val; + np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); retv = 0; break; @@ -980,8 +980,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, struct in6_pktinfo src_info; src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : np->sticky_pktinfo.ipi6_ifindex; - np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) : - ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr)); + src_info.ipi6_addr = np->mcast_oif ? np->daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { @@ -992,8 +991,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, struct in6_pktinfo src_info; src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : np->sticky_pktinfo.ipi6_ifindex; - np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) : - ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr)); + src_info.ipi6_addr = np->mcast_oif ? np->daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index ee7839f4d6e3..518cbb90c44b 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -155,7 +155,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -ENOMEM; mc_lst->next = NULL; - ipv6_addr_copy(&mc_lst->addr, addr); + mc_lst->addr = *addr; rcu_read_lock(); if (ifindex == 0) { @@ -858,7 +858,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); - ipv6_addr_copy(&mc->mca_addr, addr); + mc->mca_addr = *addr; mc->idev = idev; /* (reference taken) */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ @@ -1343,13 +1343,15 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) struct mld2_report *pmr; struct in6_addr addr_buf; const struct in6_addr *saddr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - size += LL_ALLOCATED_SPACE(dev); + size += hlen + tlen; /* limit our allocations to order-0 page */ size = min_t(int, size, SKB_MAX_ORDER(0, 0)); skb = sock_alloc_send_skb(sk, size, 1, &err); @@ -1357,7 +1359,7 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) if (!skb) return NULL; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: @@ -1723,6 +1725,8 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) struct mld_msg *hdr; const struct in6_addr *snd_addr, *saddr; struct in6_addr addr_buf; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, @@ -1744,7 +1748,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPSTATS_MIB_OUT, full_len); rcu_read_unlock(); - skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + full_len, 1, &err); + skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); if (skb == NULL) { rcu_read_lock(); @@ -1754,7 +1758,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) return; } - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: @@ -1772,7 +1776,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg)); memset(hdr, 0, sizeof(struct mld_msg)); hdr->mld_type = type; - ipv6_addr_copy(&hdr->mld_mca, addr); + hdr->mld_mca = *addr; hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, IPPROTO_ICMPV6, @@ -1914,7 +1918,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, * Add multicast single-source filter to the interface list */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, - const struct in6_addr *psfsrc, int delta) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; @@ -2045,7 +2049,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfcount[sfmode]++; err = 0; for (i=0; i<sfcount; i++) { - err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 43242e6e6103..7e1e0fbfef21 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -195,8 +195,8 @@ static inline int mip6_report_rl_allow(struct timeval *stamp, mip6_report_rl.stamp.tv_sec = stamp->tv_sec; mip6_report_rl.stamp.tv_usec = stamp->tv_usec; mip6_report_rl.iif = iif; - ipv6_addr_copy(&mip6_report_rl.src, src); - ipv6_addr_copy(&mip6_report_rl.dst, dst); + mip6_report_rl.src = *src; + mip6_report_rl.dst = *dst; allow = 1; } spin_unlock_bh(&mip6_report_rl.lock); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 44e5b7f2a6c1..e72c8af85781 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -126,7 +126,6 @@ static const struct neigh_ops ndisc_direct_ops = { struct neigh_table nd_tbl = { .family = AF_INET6, - .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), .key_len = sizeof(struct in6_addr), .hash = ndisc_hash, .constructor = ndisc_constructor, @@ -141,7 +140,7 @@ struct neigh_table nd_tbl = { .gc_staletime = 60 * HZ, .reachable_time = ND_REACHABLE_TIME, .delay_probe_time = 5 * HZ, - .queue_len = 3, + .queue_len_bytes = 64*1024, .ucast_probes = 3, .mcast_probes = 3, .anycast_delay = 1 * HZ, @@ -446,6 +445,8 @@ struct sk_buff *ndisc_build_skb(struct net_device *dev, struct sock *sk = net->ipv6.ndisc_sk; struct sk_buff *skb; struct icmp6hdr *hdr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int len; int err; u8 *opt; @@ -459,7 +460,7 @@ struct sk_buff *ndisc_build_skb(struct net_device *dev, skb = sock_alloc_send_skb(sk, (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_ALLOCATED_SPACE(dev)), + len + hlen + tlen), 1, &err); if (!skb) { ND_PRINTK0(KERN_ERR @@ -468,7 +469,7 @@ struct sk_buff *ndisc_build_skb(struct net_device *dev, return NULL; } - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); skb->transport_header = skb->tail; @@ -479,7 +480,7 @@ struct sk_buff *ndisc_build_skb(struct net_device *dev, opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); if (target) { - ipv6_addr_copy((struct in6_addr *)opt, target); + *(struct in6_addr *)opt = *target; opt += sizeof(*target); } @@ -1237,7 +1238,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); if (rt) - neigh = dst_get_neighbour(&rt->dst); + neigh = dst_get_neighbour_noref(&rt->dst); if (rt && lifetime == 0) { neigh_clone(neigh); @@ -1257,7 +1258,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = dst_get_neighbour(&rt->dst); + neigh = dst_get_neighbour_noref(&rt->dst); if (neigh == NULL) { ND_PRINTK0(KERN_ERR "ICMPv6 RA: %s() got default router without neighbour.\n", @@ -1533,6 +1534,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, struct inet6_dev *idev; struct flowi6 fl6; u8 *opt; + int hlen, tlen; int rd_len; int err; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; @@ -1571,7 +1573,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, } if (!rt->rt6i_peer) rt6_bind_peer(rt, 1); - if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + if (!inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) goto release; if (dev->addr_len) { @@ -1590,9 +1592,11 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, rd_len &= ~0x7; len += rd_len; + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; buff = sock_alloc_send_skb(sk, (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_ALLOCATED_SPACE(dev)), + len + hlen + tlen), 1, &err); if (buff == NULL) { ND_PRINTK0(KERN_ERR @@ -1601,7 +1605,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, goto release; } - skb_reserve(buff, LL_RESERVED_SPACE(dev)); + skb_reserve(buff, hlen); ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, IPPROTO_ICMPV6, len); @@ -1617,9 +1621,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, */ addrp = (struct in6_addr *)(icmph + 1); - ipv6_addr_copy(addrp, target); + *addrp = *target; addrp++; - ipv6_addr_copy(addrp, &ipv6_hdr(skb)->daddr); + *addrp = ipv6_hdr(skb)->daddr; opt = (u8*) (addrp + 1); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 448464844a25..f792b34cbe9c 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -186,7 +186,6 @@ config IP6_NF_MANGLE config IP6_NF_RAW tristate 'raw table support (required for TRACE)' - depends on NETFILTER_ADVANCED help This option adds a `raw' table to ip6tables. This table is the very first in the netfilter framework and hooks in at the PREROUTING diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index e63c3972a739..fb80a23c6640 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -405,6 +405,7 @@ __ipq_rcv_skb(struct sk_buff *skb) int status, type, pid, flags; unsigned int nlmsglen, skblen; struct nlmsghdr *nlh; + bool enable_timestamp = false; skblen = skb->len; if (skblen < sizeof(*nlh)) @@ -442,11 +443,13 @@ __ipq_rcv_skb(struct sk_buff *skb) RCV_SKB_FAIL(-EBUSY); } } else { - net_enable_timestamp(); + enable_timestamp = true; peer_pid = pid; } spin_unlock_bh(&queue_lock); + if (enable_timestamp) + net_enable_timestamp(); status = ipq_receive_peer(NLMSG_DATA(nlh), type, nlmsglen - NLMSG_LENGTH(0)); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index a5a4c5dd5396..aad2fa41cf46 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -49,6 +49,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) const __u8 tclass = DEFAULT_TOS_VALUE; struct dst_entry *dst = NULL; u8 proto; + __be16 frag_off; struct flowi6 fl6; if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || @@ -58,7 +59,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) } proto = oip6h->nexthdr; - tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); if ((tcphoff < 0) || (tcphoff > oldskb->len)) { pr_debug("Cannot get TCP header.\n"); @@ -93,8 +94,8 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; - ipv6_addr_copy(&fl6.saddr, &oip6h->daddr); - ipv6_addr_copy(&fl6.daddr, &oip6h->saddr); + fl6.saddr = oip6h->daddr; + fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph.dest; fl6.fl6_dport = otcph.source; security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); @@ -129,8 +130,8 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); ip6h->hop_limit = ip6_dst_hoplimit(dst); ip6h->nexthdr = IPPROTO_TCP; - ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); - ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); + ip6h->saddr = oip6h->daddr; + ip6h->daddr = oip6h->saddr; tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); /* Truncate to length (no data) */ diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 1008ce94bc33..fdeb6d03da81 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -142,11 +142,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_SENTINEL }; -/* can be called either with percpu mib (pcpumib != NULL), - * or shared one (smib != NULL) - */ -static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void __percpu **pcpumib, - atomic_long_t *smib) +static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { char name[32]; int i; @@ -163,14 +159,14 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void __percpu **pcpum snprintf(name, sizeof(name), "Icmp6%s%s", i & 0x100 ? "Out" : "In", p); seq_printf(seq, "%-32s\t%lu\n", name, - pcpumib ? snmp_fold_field(pcpumib, i) : atomic_long_read(smib + i)); + atomic_long_read(smib + i)); } /* print by number (nonzero only) - ICMPMsgStat format */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { unsigned long val; - val = pcpumib ? snmp_fold_field(pcpumib, i) : atomic_long_read(smib + i); + val = atomic_long_read(smib + i); if (!val) continue; snprintf(name, sizeof(name), "Icmp6%sType%u", @@ -215,8 +211,7 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics, NULL, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, - (void __percpu **)net->mib.icmpv6msg_statistics, NULL); + snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, (void __percpu **)net->mib.udp_stats_in6, NULL, snmp6_udp6_list); snmp6_seq_show_item(seq, (void __percpu **)net->mib.udplite_stats_in6, @@ -246,7 +241,7 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) snmp6_ipstats_list); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, NULL, idev->stats.icmpv6msgdev->mibs); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 331af3b882ac..a4894f4f1944 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -299,9 +299,9 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) } inet->inet_rcv_saddr = inet->inet_saddr = v4addr; - ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + np->rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) - ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + np->saddr = addr->sin6_addr; err = 0; out_unlock: rcu_read_unlock(); @@ -383,7 +383,8 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) } /* Charge it to the socket. */ - if (ip_queue_rcv_skb(sk, skb) < 0) { + skb_dst_drop(skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } @@ -494,7 +495,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; - ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); + sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) @@ -610,6 +611,8 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, struct sk_buff *skb; int err; struct rt6_info *rt = (struct rt6_info *)*dstp; + int hlen = LL_RESERVED_SPACE(rt->dst.dev); + int tlen = rt->dst.dev->needed_tailroom; if (length > rt->dst.dev->mtu) { ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); @@ -619,11 +622,11 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, goto out; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, + length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); + skb_reserve(skb, hlen); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -843,11 +846,11 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, goto out; if (!ipv6_addr_any(daddr)) - ipv6_addr_copy(&fl6.daddr, daddr); + fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index dfb164e9051a..b69fae76a6f1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -153,8 +153,8 @@ void ip6_frag_init(struct inet_frag_queue *q, void *a) fq->id = arg->id; fq->user = arg->user; - ipv6_addr_copy(&fq->saddr, arg->src); - ipv6_addr_copy(&fq->daddr, arg->dst); + fq->saddr = *arg->src; + fq->daddr = *arg->dst; } EXPORT_SYMBOL(ip6_frag_init); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8473016bba4a..09412baf1ca6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -77,7 +77,7 @@ static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, const struct in6_addr *dest); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); -static unsigned int ip6_default_mtu(const struct dst_entry *dst); +static unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, @@ -144,7 +144,7 @@ static struct dst_ops ip6_dst_ops_template = { .gc_thresh = 1024, .check = ip6_dst_check, .default_advmss = ip6_default_advmss, - .default_mtu = ip6_default_mtu, + .mtu = ip6_mtu, .cow_metrics = ipv6_cow_metrics, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, @@ -155,9 +155,11 @@ static struct dst_ops ip6_dst_ops_template = { .neigh_lookup = ip6_neigh_lookup, }; -static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst) +static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) { - return 0; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; } static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -175,7 +177,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .protocol = cpu_to_be16(ETH_P_IPV6), .destroy = ip6_dst_destroy, .check = ip6_dst_check, - .default_mtu = ip6_blackhole_default_mtu, + .mtu = ip6_blackhole_mtu, .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, .cow_metrics = ip6_rt_blackhole_cow_metrics, @@ -245,9 +247,9 @@ static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops, { struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags); - if (rt != NULL) + if (rt) memset(&rt->rt6i_table, 0, - sizeof(*rt) - sizeof(struct dst_entry)); + sizeof(*rt) - sizeof(struct dst_entry)); return rt; } @@ -261,7 +263,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) if (!(rt->dst.flags & DST_HOST)) dst_destroy_metrics_generic(dst); - if (idev != NULL) { + if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } @@ -297,10 +299,10 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, struct net_device *loopback_dev = dev_net(dev)->loopback_dev; - if (dev != loopback_dev && idev != NULL && idev->dev == dev) { + if (dev != loopback_dev && idev && idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); - if (loopback_idev != NULL) { + if (loopback_idev) { rt->rt6i_idev = loopback_idev; in6_dev_put(idev); } @@ -342,7 +344,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (dev->ifindex == oif) return sprt; if (dev->flags & IFF_LOOPBACK) { - if (sprt->rt6i_idev == NULL || + if (!sprt->rt6i_idev || sprt->rt6i_idev->dev->ifindex != oif) { if (flags & RT6_LOOKUP_F_IFACE && oif) continue; @@ -383,7 +385,7 @@ static void rt6_probe(struct rt6_info *rt) * to no more than one per minute. */ rcu_read_lock(); - neigh = rt ? dst_get_neighbour(&rt->dst) : NULL; + neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL; if (!neigh || (neigh->nud_state & NUD_VALID)) goto out; read_lock_bh(&neigh->lock); @@ -430,7 +432,7 @@ static inline int rt6_check_neigh(struct rt6_info *rt) int m; rcu_read_lock(); - neigh = dst_get_neighbour(&rt->dst); + neigh = dst_get_neighbour_noref(&rt->dst); if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) m = 1; @@ -634,7 +636,7 @@ do { \ goto restart; \ } \ } \ -} while(0) +} while (0) static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, @@ -725,24 +727,25 @@ static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort, struct neighbour *neigh; int attempts = !in_softirq(); - if (!(rt->rt6i_flags&RTF_GATEWAY)) { + if (!(rt->rt6i_flags & RTF_GATEWAY)) { if (rt->rt6i_dst.plen != 128 && ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; - ipv6_addr_copy(&rt->rt6i_gateway, daddr); + rt->rt6i_gateway = *daddr; } rt->rt6i_flags |= RTF_CACHE; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { - ipv6_addr_copy(&rt->rt6i_src.addr, saddr); + rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif retry: - neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, + rt->rt6i_dev); if (IS_ERR(neigh)) { struct net *net = dev_net(rt->rt6i_dev); int saved_rt_min_interval = @@ -783,7 +786,7 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, if (rt) { rt->rt6i_flags |= RTF_CACHE; - dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst))); + dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst))); } return rt; } @@ -817,7 +820,7 @@ restart: dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); else if (!(rt->dst.flags & DST_HOST)) nrt = rt6_alloc_clone(rt, &fl6->daddr); @@ -873,7 +876,7 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, - .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK, + .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; @@ -932,7 +935,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori in6_dev_hold(rt->rt6i_idev); rt->rt6i_expires = 0; - ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); + rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; rt->rt6i_metric = 0; @@ -995,7 +998,7 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { - if (rt->rt6i_flags&RTF_CACHE) { + if (rt->rt6i_flags & RTF_CACHE) { dst_set_expires(&rt->dst, 0); rt->rt6i_flags |= RTF_EXPIRES; } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) @@ -1041,10 +1044,15 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) return mtu; } -static unsigned int ip6_default_mtu(const struct dst_entry *dst) +static unsigned int ip6_mtu(const struct dst_entry *dst) { - unsigned int mtu = IPV6_MIN_MTU; struct inet6_dev *idev; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu) + return mtu; + + mtu = IPV6_MIN_MTU; rcu_read_lock(); idev = __in6_dev_get(dst->dev); @@ -1066,11 +1074,11 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); - if (unlikely(idev == NULL)) + if (unlikely(!idev)) return NULL; rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); - if (unlikely(rt == NULL)) { + if (unlikely(!rt)) { in6_dev_put(idev); goto out; } @@ -1078,7 +1086,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, if (neigh) neigh_hold(neigh); else { - neigh = ndisc_get_neigh(dev, addr); + neigh = __neigh_lookup_errno(&nd_tbl, addr, dev); if (IS_ERR(neigh)) neigh = NULL; } @@ -1087,7 +1095,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->dst.output = ip6_output; dst_set_neighbour(&rt->dst, neigh); atomic_set(&rt->dst.__refcnt, 1); - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); @@ -1230,15 +1238,24 @@ int ip6_route_add(struct fib6_config *cfg) if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; - table = fib6_new_table(net, cfg->fc_table); - if (table == NULL) { - err = -ENOBUFS; - goto out; + err = -ENOBUFS; + if (cfg->fc_nlinfo.nlh && + !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { + table = fib6_get_table(net, cfg->fc_table); + if (!table) { + printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n"); + table = fib6_new_table(net, cfg->fc_table); + } + } else { + table = fib6_new_table(net, cfg->fc_table); } + if (!table) + goto out; + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT); - if (rt == NULL) { + if (!rt) { err = -ENOMEM; goto out; } @@ -1287,8 +1304,9 @@ int ip6_route_add(struct fib6_config *cfg) they would result in kernel looping; promote them to reject routes */ if ((cfg->fc_flags & RTF_REJECT) || - (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK) - && !(cfg->fc_flags&RTF_LOCAL))) { + (dev && (dev->flags & IFF_LOOPBACK) && + !(addr_type & IPV6_ADDR_LOOPBACK) && + !(cfg->fc_flags & RTF_LOCAL))) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { @@ -1315,7 +1333,7 @@ int ip6_route_add(struct fib6_config *cfg) int gwa_type; gw_addr = &cfg->fc_gateway; - ipv6_addr_copy(&rt->rt6i_gateway, gw_addr); + rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { @@ -1329,13 +1347,13 @@ int ip6_route_add(struct fib6_config *cfg) some exceptions. --ANK */ err = -EINVAL; - if (!(gwa_type&IPV6_ADDR_UNICAST)) + if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); err = -EHOSTUNREACH; - if (grt == NULL) + if (!grt) goto out; if (dev) { if (dev != grt->rt6i_dev) { @@ -1348,7 +1366,7 @@ int ip6_route_add(struct fib6_config *cfg) dev_hold(dev); in6_dev_hold(grt->rt6i_idev); } - if (!(grt->rt6i_flags&RTF_GATEWAY)) + if (!(grt->rt6i_flags & RTF_GATEWAY)) err = 0; dst_release(&grt->dst); @@ -1356,12 +1374,12 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } err = -EINVAL; - if (dev == NULL || (dev->flags&IFF_LOOPBACK)) + if (!dev || (dev->flags & IFF_LOOPBACK)) goto out; } err = -ENODEV; - if (dev == NULL) + if (!dev) goto out; if (!ipv6_addr_any(&cfg->fc_prefsrc)) { @@ -1369,7 +1387,7 @@ int ip6_route_add(struct fib6_config *cfg) err = -EINVAL; goto out; } - ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc); + rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; rt->rt6i_prefsrc.plen = 128; } else rt->rt6i_prefsrc.plen = 0; @@ -1458,7 +1476,7 @@ static int ip6_route_del(struct fib6_config *cfg) int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); - if (table == NULL) + if (!table) return err; read_lock_bh(&table->tb6_lock); @@ -1470,7 +1488,7 @@ static int ip6_route_del(struct fib6_config *cfg) if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (cfg->fc_ifindex && - (rt->rt6i_dev == NULL || + (!rt->rt6i_dev || rt->rt6i_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && @@ -1566,7 +1584,7 @@ static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest, }, }; - ipv6_addr_copy(&rdfl.gateway, gateway); + rdfl.gateway = *gateway; if (rt6_need_strict(dest)) flags |= RT6_LOOKUP_F_IFACE; @@ -1611,18 +1629,18 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, dst_confirm(&rt->dst); /* Duplicate redirect: silently ignore. */ - if (neigh == dst_get_neighbour_raw(&rt->dst)) + if (neigh == dst_get_neighbour_noref_raw(&rt->dst)) goto out; nrt = ip6_rt_copy(rt, dest); - if (nrt == NULL) + if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); + nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; dst_set_neighbour(&nrt->dst, neigh_clone(neigh)); if (ip6_ins_rt(nrt)) @@ -1632,7 +1650,7 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, netevent.new = &nrt->dst; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags&RTF_CACHE) { + if (rt->rt6i_flags & RTF_CACHE) { ip6_del_rt(rt); return; } @@ -1653,7 +1671,7 @@ static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr int allfrag = 0; again: rt = rt6_lookup(net, daddr, saddr, ifindex, 0); - if (rt == NULL) + if (!rt) return; if (rt6_check_expired(rt)) { @@ -1703,7 +1721,7 @@ again: 1. It is connected route. Action: COW 2. It is gatewayed route or NONEXTHOP route. Action: clone it. */ - if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) nrt = rt6_alloc_cow(rt, daddr, saddr); else nrt = rt6_alloc_clone(rt, daddr); @@ -1768,7 +1786,7 @@ static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, rt->dst.output = ort->dst.output; rt->dst.flags |= DST_HOST; - ipv6_addr_copy(&rt->rt6i_dst.addr, dest); + rt->rt6i_dst.addr = *dest; rt->rt6i_dst.plen = 128; dst_copy_metrics(&rt->dst, &ort->dst); rt->dst.error = ort->dst.error; @@ -1778,7 +1796,7 @@ static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort, rt->dst.lastuse = jiffies; rt->rt6i_expires = 0; - ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); + rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; rt->rt6i_metric = 0; @@ -1801,7 +1819,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, struct fib6_table *table; table = fib6_get_table(net, RT6_TABLE_INFO); - if (table == NULL) + if (!table) return NULL; write_lock_bh(&table->tb6_lock); @@ -1841,8 +1859,8 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_nlinfo.nl_net = net, }; - ipv6_addr_copy(&cfg.fc_dst, prefix); - ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + cfg.fc_dst = *prefix; + cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) @@ -1860,7 +1878,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev struct fib6_table *table; table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); - if (table == NULL) + if (!table) return NULL; write_lock_bh(&table->tb6_lock); @@ -1891,7 +1909,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_nlinfo.nl_net = dev_net(dev), }; - ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + cfg.fc_gateway = *gwaddr; ip6_route_add(&cfg); @@ -1905,7 +1923,7 @@ void rt6_purge_dflt_routers(struct net *net) /* NOTE: Keep consistent with rt6_get_dflt_router */ table = fib6_get_table(net, RT6_TABLE_DFLT); - if (table == NULL) + if (!table) return; restart: @@ -1937,9 +1955,9 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_nlinfo.nl_net = net; - ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst); - ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src); - ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway); + cfg->fc_dst = rtmsg->rtmsg_dst; + cfg->fc_src = rtmsg->rtmsg_src; + cfg->fc_gateway = rtmsg->rtmsg_gateway; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) @@ -2045,7 +2063,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, net->loopback_dev, 0); struct neighbour *neigh; - if (rt == NULL) { + if (!rt) { if (net_ratelimit()) pr_warning("IPv6: Maximum number of routes reached," " consider increasing route/max_size.\n"); @@ -2065,7 +2083,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_flags |= RTF_ANYCAST; else rt->rt6i_flags |= RTF_LOCAL; - neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev); if (IS_ERR(neigh)) { dst_free(&rt->dst); @@ -2073,7 +2091,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, } dst_set_neighbour(&rt->dst, neigh); - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); @@ -2091,7 +2109,7 @@ int ip6_route_get_saddr(struct net *net, struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); int err = 0; if (rt->rt6i_prefsrc.plen) - ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr); + *saddr = rt->rt6i_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, prefs, saddr); @@ -2111,7 +2129,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->rt6i_dev == dev || dev == NULL) && + if (((void *)rt->rt6i_dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { /* remove prefsrc entry */ @@ -2141,7 +2159,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg) const struct arg_dev_net *adn = arg; const struct net_device *dev = adn->dev; - if ((rt->rt6i_dev == dev || dev == NULL) && + if ((rt->rt6i_dev == dev || !dev) && rt != adn->net->ipv6.ip6_null_entry) { RT6_TRACE("deleted by ifdown %p\n", rt); return -1; @@ -2178,7 +2196,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) */ idev = __in6_dev_get(arg->dev); - if (idev == NULL) + if (!idev) return 0; /* For administrative MTU increase, there is no way to discover @@ -2358,7 +2376,7 @@ static int rt6_fill_node(struct net *net, } nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); @@ -2372,25 +2390,25 @@ static int rt6_fill_node(struct net *net, table = RT6_TABLE_UNSPEC; rtm->rtm_table = table; NLA_PUT_U32(skb, RTA_TABLE, table); - if (rt->rt6i_flags&RTF_REJECT) + if (rt->rt6i_flags & RTF_REJECT) rtm->rtm_type = RTN_UNREACHABLE; - else if (rt->rt6i_flags&RTF_LOCAL) + else if (rt->rt6i_flags & RTF_LOCAL) rtm->rtm_type = RTN_LOCAL; - else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) rtm->rtm_type = RTN_LOCAL; else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags&RTF_DYNAMIC) + if (rt->rt6i_flags & RTF_DYNAMIC) rtm->rtm_protocol = RTPROT_REDIRECT; else if (rt->rt6i_flags & RTF_ADDRCONF) rtm->rtm_protocol = RTPROT_KERNEL; - else if (rt->rt6i_flags&RTF_DEFAULT) + else if (rt->rt6i_flags & RTF_DEFAULT) rtm->rtm_protocol = RTPROT_RA; - if (rt->rt6i_flags&RTF_CACHE) + if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dst) { @@ -2430,7 +2448,7 @@ static int rt6_fill_node(struct net *net, if (rt->rt6i_prefsrc.plen) { struct in6_addr saddr_buf; - ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr); + saddr_buf = rt->rt6i_prefsrc.addr; NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); } @@ -2438,7 +2456,7 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; rcu_read_lock(); - n = dst_get_neighbour(&rt->dst); + n = dst_get_neighbour_noref(&rt->dst); if (n) NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key); rcu_read_unlock(); @@ -2504,14 +2522,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; - ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC])); + fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; - ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST])); + fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) @@ -2530,7 +2548,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (skb == NULL) { + if (!skb) { err = -ENOBUFS; goto errout; } @@ -2565,10 +2583,10 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) int err; err = -ENOBUFS; - seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0; + seq = info->nlh ? info->nlh->nlmsg_seq : 0; skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); - if (skb == NULL) + if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, @@ -2635,7 +2653,7 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg) seq_puts(m, "00000000000000000000000000000000 00 "); #endif rcu_read_lock(); - n = dst_get_neighbour(&rt->dst); + n = dst_get_neighbour_noref(&rt->dst); if (n) { seq_printf(m, "%pi6", n->primary_key); } else { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a7a18602a046..b7d14cc12ee8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -91,7 +91,7 @@ struct pcpu_tstats { unsigned long rx_bytes; unsigned long tx_packets; unsigned long tx_bytes; -}; +} __attribute__((aligned(4*sizeof(unsigned long)))); static struct net_device_stats *ipip6_get_stats(struct net_device *dev) { @@ -680,7 +680,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct neighbour *neigh = NULL; if (skb_dst(skb)) - neigh = dst_get_neighbour(skb_dst(skb)); + neigh = dst_get_neighbour_noref(skb_dst(skb)); if (neigh == NULL) { if (net_ratelimit()) @@ -705,7 +705,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct neighbour *neigh = NULL; if (skb_dst(skb)) - neigh = dst_get_neighbour(skb_dst(skb)); + neigh = dst_get_neighbour_noref(skb_dst(skb)); if (neigh == NULL) { if (net_ratelimit()) @@ -914,7 +914,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) goto done; #ifdef CONFIG_IPV6_SIT_6RD } else { - ipv6_addr_copy(&ip6rd.prefix, &t->ip6rd.prefix); + ip6rd.prefix = t->ip6rd.prefix; ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; @@ -1082,7 +1082,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if (relay_prefix != ip6rd.relay_prefix) goto done; - ipv6_addr_copy(&t->ip6rd.prefix, &prefix); + t->ip6rd.prefix = prefix; t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd.prefixlen; t->ip6rd.relay_prefixlen = ip6rd.relay_prefixlen; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 5a0d6648bbbc..8e951d8d3b81 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -200,8 +200,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->mss = mss; ireq->rmt_port = th->source; ireq->loc_port = th->dest; - ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); + ireq6->rmt_addr = ipv6_hdr(skb)->saddr; + ireq6->loc_addr = ipv6_hdr(skb)->daddr; if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { @@ -237,9 +237,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; - ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + fl6.daddr = ireq6->rmt_addr; final_p = fl6_update_dst(&fl6, np->opt, &final); - ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr); + fl6.saddr = ireq6->loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet_rsk(req)->rmt_port; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 36131d122a6f..9d74eee334d6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -153,7 +153,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + usin->sin6_addr = flowlabel->dst; fl6_sock_release(flowlabel); } } @@ -195,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->write_seq = 0; } - ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* @@ -244,9 +244,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, saddr = &np->rcv_saddr; fl6.flowi6_proto = IPPROTO_TCP; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, - (saddr ? saddr : &np->saddr)); + fl6.daddr = np->daddr; + fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; @@ -264,11 +263,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (saddr == NULL) { saddr = &fl6.saddr; - ipv6_addr_copy(&np->rcv_saddr, saddr); + np->rcv_saddr = *saddr; } /* set the source address */ - ipv6_addr_copy(&np->saddr, saddr); + np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; @@ -398,8 +397,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; - ipv6_addr_copy(&fl6.daddr, &np->daddr); - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; @@ -489,8 +488,8 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; - ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr); - ipv6_addr_copy(&fl6.saddr, &treq->loc_addr); + fl6.daddr = treq->rmt_addr; + fl6.saddr = treq->loc_addr; fl6.flowlabel = 0; fl6.flowi6_oif = treq->iif; fl6.flowi6_mark = sk->sk_mark; @@ -512,7 +511,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, if (skb) { __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); - ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr); + fl6.daddr = treq->rmt_addr; err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); err = net_xmit_eval(err); } @@ -617,8 +616,7 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer, tp->md5sig_info->alloced6++; } - ipv6_addr_copy(&tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr, - peer); + tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr = *peer; tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.key = newkey; tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.keylen = newkeylen; @@ -750,8 +748,8 @@ static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, bp = &hp->md5_blk.ip6; /* 1. TCP pseudo-header (RFC2460) */ - ipv6_addr_copy(&bp->saddr, saddr); - ipv6_addr_copy(&bp->daddr, daddr); + bp->saddr = *saddr; + bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); @@ -1039,8 +1037,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, #endif memset(&fl6, 0, sizeof(fl6)); - ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&fl6.saddr, &ipv6_hdr(skb)->daddr); + fl6.daddr = ipv6_hdr(skb)->saddr; + fl6.saddr = ipv6_hdr(skb)->daddr; buff->ip_summed = CHECKSUM_PARTIAL; buff->csum = 0; @@ -1250,11 +1248,18 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init(req, &tmp_opt, skb); treq = inet6_rsk(req); - ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr); + treq->rmt_addr = ipv6_hdr(skb)->saddr; + treq->loc_addr = ipv6_hdr(skb)->daddr; if (!want_cookie || tmp_opt.tstamp_ok) TCP_ECN_create_request(req, tcp_hdr(skb)); + treq->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) + treq->iif = inet6_iif(skb); + if (!isn) { struct inet_peer *peer = NULL; @@ -1264,12 +1269,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) atomic_inc(&skb->users); treq->pktopts = skb; } - treq->iif = sk->sk_bound_dev_if; - - /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && - ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) - treq->iif = inet6_iif(skb); if (want_cookie) { isn = cookie_v6_init_sequence(sk, skb, &req->mss); @@ -1380,7 +1379,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + newnp->rcv_saddr = newnp->saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -1444,9 +1443,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr); - ipv6_addr_copy(&newnp->saddr, &treq->loc_addr); - ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr); + newnp->daddr = treq->rmt_addr; + newnp->saddr = treq->loc_addr; + newnp->rcv_saddr = treq->loc_addr; newsk->sk_bound_dev_if = treq->iif; /* Now IPv6 options... diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 846f4757eb8d..adfe26a7fc63 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -340,7 +340,7 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - unsigned int ulen; + unsigned int ulen, copied; int peeked; int err; int is_udplite = IS_UDPLITE(sk); @@ -363,9 +363,10 @@ try_again: goto out; ulen = skb->len - sizeof(struct udphdr); - if (len > ulen) - len = ulen; - else if (len < ulen) + copied = len; + if (copied > ulen) + copied = ulen; + else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; is_udp4 = (skb->protocol == htons(ETH_P_IP)); @@ -376,14 +377,14 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { if (udp_lib_checksum_complete(skb)) goto csum_copy_err; } if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov,len); + msg->msg_iov, copied ); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) @@ -417,8 +418,7 @@ try_again: ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin6->sin6_addr); else { - ipv6_addr_copy(&sin6->sin6_addr, - &ipv6_hdr(skb)->saddr); + sin6->sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6->sin6_scope_id = IP6CB(skb)->iif; } @@ -432,7 +432,7 @@ try_again: datagram_recv_ctl(sk, msg, skb); } - err = len; + err = copied; if (flags & MSG_TRUNC) err = ulen; @@ -538,7 +538,9 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) goto drop; } - if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) { + skb_dst_drop(skb); + rc = sock_queue_rcv_skb(sk, skb); + if (rc < 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) UDP6_INC_STATS_BH(sock_net(sk), @@ -1113,11 +1115,11 @@ do_udp_sendmsg: fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) - ipv6_addr_copy(&fl6.daddr, daddr); + fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; final_p = fl6_update_dst(&fl6, opt, &final); @@ -1298,7 +1300,8 @@ static int udp6_ufo_send_check(struct sk_buff *skb) return 0; } -static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features) +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index 3437d7d4eed6..a81ce9450750 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -72,8 +72,8 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->nexthdr = IPPROTO_BEETPH; } - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } @@ -99,8 +99,8 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); - ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); - ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); + ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; + ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; err = 0; out: return err; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 4d6edff0498f..261e6e6f487e 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -55,8 +55,8 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); top_iph->hop_limit = ip6_dst_hoplimit(dst->child); - ipv6_addr_copy(&top_iph->saddr, (const struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (const struct in6_addr *)&x->id.daddr); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index faae41737fca..4eeff89c1aaa 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -49,7 +49,7 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) struct sock *sk = skb->sk; fl6.flowi6_oif = sk->sk_bound_dev_if; - ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->daddr); + fl6.daddr = ipv6_hdr(skb)->daddr; ipv6_local_rxpmtu(sk, &fl6, mtu); } @@ -60,7 +60,7 @@ static void xfrm6_local_error(struct sk_buff *skb, u32 mtu) struct sock *sk = skb->sk; fl6.fl6_dport = inet_sk(sk)->inet_dport; - ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->daddr); + fl6.daddr = ipv6_hdr(skb)->daddr; ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index d879f7efbd10..8ea65e032733 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -132,8 +132,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; - ipv6_addr_copy(&fl6->daddr, reverse ? &hdr->saddr : &hdr->daddr); - ipv6_addr_copy(&fl6->saddr, reverse ? &hdr->daddr : &hdr->saddr); + fl6->daddr = reverse ? hdr->saddr : hdr->daddr; + fl6->saddr = reverse ? hdr->daddr : hdr->saddr; while (nh + offset + 1 < skb->data || pskb_may_pull(skb, nh + offset + 1 - skb->data)) { diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index f2d72b8a3faa..3f2f7c4ab721 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -27,8 +27,8 @@ __xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) /* Initialize temporary selector matching only * to current session. */ - ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl6->daddr); - ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl6->saddr); + *(struct in6_addr *)&sel->daddr = fl6->daddr; + *(struct in6_addr *)&sel->saddr = fl6->saddr; sel->dport = xfrm_flowi_dport(fl, &fl6->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl6->uli); diff --git a/net/irda/irttp.c b/net/irda/irttp.c index 32e3bb026110..5c93f2952b08 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -1461,14 +1461,12 @@ struct tsap_cb *irttp_dup(struct tsap_cb *orig, void *instance) } /* Allocate a new instance */ - new = kmalloc(sizeof(struct tsap_cb), GFP_ATOMIC); + new = kmemdup(orig, sizeof(struct tsap_cb), GFP_ATOMIC); if (!new) { IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __func__); spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); return NULL; } - /* Dup */ - memcpy(new, orig, sizeof(struct tsap_cb)); spin_lock_init(&new->lock); /* We don't need the old instance any more */ diff --git a/net/key/af_key.c b/net/key/af_key.c index 1e733e9073d0..bfc0bef170cb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -712,7 +712,7 @@ static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; - ipv6_addr_copy(&sin6->sin6_addr, (const struct in6_addr *)xaddr->a6); + sin6->sin6_addr = *(struct in6_addr *)xaddr->a6; sin6->sin6_scope_id = 0; return 128; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index bf8d50c67931..89ff8c67943e 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -756,9 +756,6 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, goto error; } - /* Point to L2TP header */ - optr = ptr = skb->data; - /* Trace packet contents, if enabled */ if (tunnel->debug & L2TP_MSG_DATA) { length = min(32u, skb->len); @@ -769,12 +766,15 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, offset = 0; do { - printk(" %02X", ptr[offset]); + printk(" %02X", skb->data[offset]); } while (++offset < length); printk("\n"); } + /* Point to L2TP header */ + optr = ptr = skb->data; + /* Get L2TP header flags */ hdrflags = ntohs(*(__be16 *) ptr); @@ -1072,7 +1072,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len /* Get routing info from the tunnel socket */ skb_dst_drop(skb); - skb_dst_set(skb, dst_clone(__sk_dst_get(sk))); + skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0))); inet = inet_sk(sk); fl = &inet->cork.fl; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8260b13d93c9..d5597b759ba3 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -201,7 +201,6 @@ config NF_CONNTRACK_BROADCAST config NF_CONNTRACK_NETBIOS_NS tristate "NetBIOS name service protocol support" - depends on NETFILTER_ADVANCED select NF_CONNTRACK_BROADCAST help NetBIOS name service requests are sent as broadcast messages from an @@ -542,7 +541,6 @@ config NETFILTER_XT_TARGET_NOTRACK tristate '"NOTRACK" target support' depends on IP_NF_RAW || IP6_NF_RAW depends on NF_CONNTRACK - depends on NETFILTER_ADVANCED help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT diff --git a/net/netfilter/core.c b/net/netfilter/core.c index afca6c78948c..4aa0f4b19bd8 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -54,6 +54,12 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo); struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); + +#if defined(CONFIG_JUMP_LABEL) +struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +EXPORT_SYMBOL(nf_hooks_needed); +#endif + static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) @@ -70,6 +76,9 @@ int nf_register_hook(struct nf_hook_ops *reg) } list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); +#if defined(CONFIG_JUMP_LABEL) + jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif return 0; } EXPORT_SYMBOL(nf_register_hook); @@ -79,7 +88,9 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); - +#if defined(CONFIG_JUMP_LABEL) + jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif synchronize_net(); } EXPORT_SYMBOL(nf_unregister_hook); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 052579fe389a..b71a6e7ab0a5 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -116,9 +116,11 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, { int protoff; u8 nexthdr; + __be16 frag_off; nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); if (protoff < 0) return false; diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index f2d576e6b769..4015fcaf87bc 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -241,7 +241,7 @@ hash_ip6_data_isnull(const struct hash_ip6_elem *elem) static inline void hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src) { - ipv6_addr_copy(&dst->ip.in6, &src->ip.in6); + dst->ip.in6 = src->ip.in6; } static inline void diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 6ee10f5d59bd..37d667e3f6f8 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -158,7 +158,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem data = { }; - u32 ip, ip_to, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to; u32 timeout = h->timeout; bool with_ports = false; int ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index fb90e344e907..e69e2718fbe1 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -162,7 +162,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem data = { }; - u32 ip, ip_to, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to; u32 timeout = h->timeout; bool with_ports = false; int ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index deb3e3dfa5fc..64199b4e93c9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -184,7 +184,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem data = { .cidr = HOST_MASK }; - u32 ip, ip_to, p = 0, port, port_to; + u32 ip, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to, ip2_last, ip2; u32 timeout = h->timeout; bool with_ports = false; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 60d016541c58..28988196775e 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -267,7 +267,7 @@ static inline void hash_net6_data_copy(struct hash_net6_elem *dst, const struct hash_net6_elem *src) { - ipv6_addr_copy(&dst->ip.in6, &src->ip.in6); + dst->ip.in6 = src->ip.in6; dst->cidr = src->cidr; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 093cc327020f..611c3359b94d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -983,7 +983,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, if (!cp) return NF_ACCEPT; - ipv6_addr_copy(&snet.in6, &iph->saddr); + snet.in6 = iph->saddr; return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, pp, offset, sizeof(struct ipv6hdr)); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3cdd479f9b5d..bcf5563e4837 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -603,9 +603,9 @@ sloop: #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) { p += sizeof(struct ip_vs_sync_v6); - ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6); - ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6); - ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6); + s->v6.caddr = cp->caddr.in6; + s->v6.vaddr = cp->vaddr.in6; + s->v6.daddr = cp->daddr.in6; } else #endif { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index aa2d7206ee8a..38a576d05b4b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -235,7 +235,7 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, goto out_err; } } - ipv6_addr_copy(ret_saddr, &fl6.saddr); + *ret_saddr = fl6.saddr; return dst; out_err: @@ -279,7 +279,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, atomic_read(&rt->dst.__refcnt)); } if (ret_saddr) - ipv6_addr_copy(ret_saddr, &dest->dst_saddr.in6); + *ret_saddr = dest->dst_saddr.in6; spin_unlock(&dest->dst_lock); } else { dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); @@ -705,7 +705,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; - ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); + ipv6_hdr(skb)->daddr = cp->daddr.in6; if (!local || !skb->dev) { /* drop the old route when skb is not shared */ @@ -967,8 +967,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); iph->priority = old_iph->priority; memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - ipv6_addr_copy(&iph->daddr, &cp->daddr.in6); - ipv6_addr_copy(&iph->saddr, &saddr); + iph->daddr = cp->daddr.in6; + iph->saddr = saddr; iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 6b368be937c6..b62c4148b921 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -27,22 +27,17 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); -struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); - -struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly; -EXPORT_SYMBOL_GPL(nf_expect_event_cb); - /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); unsigned long events; struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); - notify = rcu_dereference(nf_conntrack_event_cb); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (notify == NULL) goto out_unlock; @@ -83,19 +78,20 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); -int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new) +int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *new) { int ret = 0; struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(nf_conntrack_event_cb, + notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); if (notify != NULL) { ret = -EBUSY; goto out_unlock; } - RCU_INIT_POINTER(nf_conntrack_event_cb, new); + RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, new); mutex_unlock(&nf_ct_ecache_mutex); return ret; @@ -105,32 +101,34 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new) +void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *new) { struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(nf_conntrack_event_cb, + notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); BUG_ON(notify != new); - RCU_INIT_POINTER(nf_conntrack_event_cb, NULL); + RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new) +int nf_ct_expect_register_notifier(struct net *net, + struct nf_exp_event_notifier *new) { int ret = 0; struct nf_exp_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(nf_expect_event_cb, + notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); if (notify != NULL) { ret = -EBUSY; goto out_unlock; } - RCU_INIT_POINTER(nf_expect_event_cb, new); + RCU_INIT_POINTER(net->ct.nf_expect_event_cb, new); mutex_unlock(&nf_ct_ecache_mutex); return ret; @@ -140,15 +138,16 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); -void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new) +void nf_ct_expect_unregister_notifier(struct net *net, + struct nf_exp_event_notifier *new) { struct nf_exp_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(nf_expect_event_cb, + notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); BUG_ON(notify != new); - RCU_INIT_POINTER(nf_expect_event_cb, NULL); + RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index f03c2d4539f6..f9368f33e7af 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -750,10 +750,10 @@ static int callforward_do_filter(const union nf_inet_addr *src, struct rt6_info *rt1, *rt2; memset(&fl1, 0, sizeof(fl1)); - ipv6_addr_copy(&fl1.daddr, &src->in6); + fl1.daddr = src->in6; memset(&fl2, 0, sizeof(fl2)); - ipv6_addr_copy(&fl2.daddr, &dst->in6); + fl2.daddr = dst->in6; if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e58aa9b1fe8a..ef21b221f036 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -4,7 +4,7 @@ * (C) 2001 by Jay Schulist <jschlst@samba.org> * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org> * (C) 2003 by Patrick Mchardy <kaber@trash.net> - * (C) 2005-2008 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2005-2011 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial connection tracking via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) @@ -2163,6 +2163,54 @@ MODULE_ALIAS("ip_conntrack_netlink"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); +static int __net_init ctnetlink_net_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + int ret; + + ret = nf_conntrack_register_notifier(net, &ctnl_notifier); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register notifier.\n"); + goto err_out; + } + + ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); + if (ret < 0) { + pr_err("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: + nf_conntrack_unregister_notifier(net, &ctnl_notifier); +err_out: + return ret; +#endif +} + +static void ctnetlink_net_exit(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); + nf_conntrack_unregister_notifier(net, &ctnl_notifier); +#endif +} + +static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + ctnetlink_net_exit(net); +} + +static struct pernet_operations ctnetlink_net_ops = { + .init = ctnetlink_net_init, + .exit_batch = ctnetlink_net_exit_batch, +}; + static int __init ctnetlink_init(void) { int ret; @@ -2180,28 +2228,15 @@ static int __init ctnetlink_init(void) goto err_unreg_subsys; } -#ifdef CONFIG_NF_CONNTRACK_EVENTS - ret = nf_conntrack_register_notifier(&ctnl_notifier); - if (ret < 0) { - pr_err("ctnetlink_init: cannot register notifier.\n"); + if (register_pernet_subsys(&ctnetlink_net_ops)) { + pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } - ret = nf_ct_expect_register_notifier(&ctnl_notifier_exp); - if (ret < 0) { - pr_err("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } -#endif - return 0; -#ifdef CONFIG_NF_CONNTRACK_EVENTS -err_unreg_notifier: - nf_conntrack_unregister_notifier(&ctnl_notifier); err_unreg_exp_subsys: nfnetlink_subsys_unregister(&ctnl_exp_subsys); -#endif err_unreg_subsys: nfnetlink_subsys_unregister(&ctnl_subsys); err_out: @@ -2213,11 +2248,7 @@ static void __exit ctnetlink_exit(void) pr_info("ctnetlink: unregistering from nfnetlink.\n"); nf_ct_remove_userspace_expectations(); -#ifdef CONFIG_NF_CONNTRACK_EVENTS - nf_ct_expect_unregister_notifier(&ctnl_notifier_exp); - nf_conntrack_unregister_notifier(&ctnl_notifier); -#endif - + unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); } diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 4bca15a0c385..ba92824086f3 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -98,6 +98,7 @@ static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) struct ipv6hdr _ip6h; const struct ipv6hdr *ih; u8 nexthdr; + __be16 frag_off; int offset; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); @@ -108,7 +109,7 @@ static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) nexthdr = ih->nexthdr; offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), - &nexthdr); + &nexthdr, &frag_off); audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", &ih->saddr, &ih->daddr, nexthdr); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 9e63b43faeed..ba722621ed25 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -161,7 +161,7 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, struct flowi6 *fl6 = &fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); - ipv6_addr_copy(&fl6->daddr, &ipv6_hdr(skb)->saddr); + fl6->daddr = ipv6_hdr(skb)->saddr; } rcu_read_lock(); ai = nf_get_afinfo(family); @@ -204,11 +204,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; + __be16 frag_off; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; - tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; ret = tcpmss_mangle_packet(skb, par->targinfo, diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 9dc9ecfdd546..3a295cc734bd 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -87,9 +87,10 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) struct ipv6hdr *ipv6h = ipv6_hdr(skb); int tcphoff; u_int8_t nexthdr; + __be16 frag_off; nexthdr = ipv6h->nexthdr; - tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index b77d383cec78..c047de2046ad 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -42,7 +42,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, int route_err; memset(&flow, 0, sizeof(flow)); - ipv6_addr_copy(&flow.daddr, addr); + flow.daddr = *addr; if (dev) flow.flowi6_oif = dev->ifindex; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index dfd52bad1523..068698f64791 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -445,6 +445,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, { __be16 _ports[2], *ports; u8 nexthdr; + __be16 frag_off; int poff; memset(dst, 0, sizeof(*dst)); @@ -480,7 +481,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if ((int)protoff < 0) return -1; break; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index fe39f7e913df..c302e30dc50c 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -214,6 +214,7 @@ extract_icmp6_fields(const struct sk_buff *skb, struct icmp6hdr *icmph, _icmph; __be16 *ports, _ports[2]; u8 inside_nexthdr; + __be16 inside_fragoff; int inside_hdrlen; icmph = skb_header_pointer(skb, outside_hdrlen, @@ -229,7 +230,8 @@ extract_icmp6_fields(const struct sk_buff *skb, return 1; inside_nexthdr = inside_iph->nexthdr; - inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr); + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), + &inside_nexthdr, &inside_fragoff); if (inside_hdrlen < 0) return 1; /* hjm: Packet has no/incomplete transport layer headers. */ diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 9c24de10a657..5952237c0c86 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -111,8 +111,6 @@ int netlbl_cfg_unlbl_map_add(const char *domain, struct netlbl_domaddr_map *addrmap = NULL; struct netlbl_domaddr4_map *map4 = NULL; struct netlbl_domaddr6_map *map6 = NULL; - const struct in_addr *addr4, *mask4; - const struct in6_addr *addr6, *mask6; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) @@ -133,9 +131,9 @@ int netlbl_cfg_unlbl_map_add(const char *domain, INIT_LIST_HEAD(&addrmap->list6); switch (family) { - case AF_INET: - addr4 = addr; - mask4 = mask; + case AF_INET: { + const struct in_addr *addr4 = addr; + const struct in_addr *mask4 = mask; map4 = kzalloc(sizeof(*map4), GFP_ATOMIC); if (map4 == NULL) goto cfg_unlbl_map_add_failure; @@ -148,25 +146,29 @@ int netlbl_cfg_unlbl_map_add(const char *domain, if (ret_val != 0) goto cfg_unlbl_map_add_failure; break; - case AF_INET6: - addr6 = addr; - mask6 = mask; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: { + const struct in6_addr *addr6 = addr; + const struct in6_addr *mask6 = mask; map6 = kzalloc(sizeof(*map6), GFP_ATOMIC); if (map6 == NULL) goto cfg_unlbl_map_add_failure; map6->type = NETLBL_NLTYPE_UNLABELED; - ipv6_addr_copy(&map6->list.addr, addr6); + map6->list.addr = *addr6; map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0]; map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1]; map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2]; map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3]; - ipv6_addr_copy(&map6->list.mask, mask6); + map6->list.mask = *mask6; map6->list.valid = 1; - ret_val = netlbl_af4list_add(&map4->list, - &addrmap->list4); + ret_val = netlbl_af6list_add(&map6->list, + &addrmap->list6); if (ret_val != 0) goto cfg_unlbl_map_add_failure; break; + } +#endif /* IPv6 */ default: goto cfg_unlbl_map_add_failure; break; @@ -225,9 +227,11 @@ int netlbl_cfg_unlbl_static_add(struct net *net, case AF_INET: addr_len = sizeof(struct in_addr); break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: addr_len = sizeof(struct in6_addr); break; +#endif /* IPv6 */ default: return -EPFNOSUPPORT; } @@ -266,9 +270,11 @@ int netlbl_cfg_unlbl_static_del(struct net *net, case AF_INET: addr_len = sizeof(struct in_addr); break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: addr_len = sizeof(struct in6_addr); break; +#endif /* IPv6 */ default: return -EPFNOSUPPORT; } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index bfa555869775..9879300beefd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -216,12 +216,12 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_failure; } - ipv6_addr_copy(&map->list.addr, addr); + map->list.addr = *addr; map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; map->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; map->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; - ipv6_addr_copy(&map->list.mask, mask); + map->list.mask = *mask; map->list.valid = 1; map->type = entry->type; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index e251c2c88521..049ccd2447d7 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -300,12 +300,12 @@ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, if (entry == NULL) return -ENOMEM; - ipv6_addr_copy(&entry->list.addr, addr); + entry->list.addr = *addr; entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; - ipv6_addr_copy(&entry->list.mask, mask); + entry->list.mask = *mask; entry->list.valid = 1; entry->secid = secid; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 482fa571b4ee..28453ae2a97b 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -33,6 +33,14 @@ void genl_unlock(void) } EXPORT_SYMBOL(genl_unlock); +#ifdef CONFIG_PROVE_LOCKING +int lockdep_genl_is_held(void) +{ + return lockdep_is_held(&genl_mutex); +} +EXPORT_SYMBOL(lockdep_genl_is_held); +#endif + #define GENL_FAM_TAB_SIZE 16 #define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) @@ -946,3 +954,16 @@ int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group, return genlmsg_mcast(skb, pid, group, flags); } EXPORT_SYMBOL(genlmsg_multicast_allns); + +void genl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) +{ + struct sock *sk = net->genl_sock; + int report = 0; + + if (nlh) + report = nlmsg_report(nlh); + + nlmsg_notify(sk, skb, pid, group, report, flags); +} +EXPORT_SYMBOL(genl_notify); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 732152f718e0..c329b474eace 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1244,7 +1244,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: - if (!capable(CAP_NET_ADMIN)) return -EPERM; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return nr_rt_ioctl(cmd, argp); default: diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 915a87ba23e1..2cf330162d7e 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -670,14 +670,17 @@ int nr_rt_ioctl(unsigned int cmd, void __user *arg) case SIOCADDRT: if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) return -EFAULT; - if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) + if (nr_route.ndigis > AX25_MAX_DIGIS) return -EINVAL; - if (nr_route.ndigis < 0 || nr_route.ndigis > AX25_MAX_DIGIS) { - dev_put(dev); + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; - } switch (nr_route.type) { case NETROM_NODE: + if (strnlen(nr_route.mnemonic, 7) == 7) { + ret = -EINVAL; + break; + } + ret = nr_add_node(&nr_route.callsign, nr_route.mnemonic, &nr_route.neighbour, diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig new file mode 100644 index 000000000000..d9ea33c361be --- /dev/null +++ b/net/openvswitch/Kconfig @@ -0,0 +1,28 @@ +# +# Open vSwitch +# + +config OPENVSWITCH + tristate "Open vSwitch" + ---help--- + Open vSwitch is a multilayer Ethernet switch targeted at virtualized + environments. In addition to supporting a variety of features + expected in a traditional hardware switch, it enables fine-grained + programmatic extension and flow-based control of the network. This + control is useful in a wide variety of applications but is + particularly important in multi-server virtualization deployments, + which are often characterized by highly dynamic endpoints and the + need to maintain logical abstractions for multiple tenants. + + The Open vSwitch datapath provides an in-kernel fast path for packet + forwarding. It is complemented by a userspace daemon, ovs-vswitchd, + which is able to accept configuration from a variety of sources and + translate it into packet processing rules. + + See http://openvswitch.org for more information and userspace + utilities. + + To compile this code as a module, choose M here: the module will be + called openvswitch. + + If unsure, say N. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile new file mode 100644 index 000000000000..15e7384745c1 --- /dev/null +++ b/net/openvswitch/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for Open vSwitch. +# + +obj-$(CONFIG_OPENVSWITCH) += openvswitch.o + +openvswitch-y := \ + actions.o \ + datapath.o \ + dp_notify.o \ + flow.o \ + vport.o \ + vport-internal_dev.o \ + vport-netdev.o \ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c new file mode 100644 index 000000000000..2725d1bdf291 --- /dev/null +++ b/net/openvswitch/actions.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/openvswitch.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/in6.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <net/dsfield.h> + +#include "datapath.h" +#include "vport.h" + +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr, int len, bool keep_skb); + +static int make_writable(struct sk_buff *skb, int write_len) +{ + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/* remove VLAN header from packet and update csum accrodingly. */ +static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) +{ + struct vlan_hdr *vhdr; + int err; + + err = make_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, csum_partial(skb->data + + ETH_HLEN, VLAN_HLEN, 0)); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *current_tci = vhdr->h_vlan_TCI; + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + skb_reset_mac_len(skb); + + return 0; +} + +static int pop_vlan(struct sk_buff *skb) +{ + __be16 tci; + int err; + + if (likely(vlan_tx_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely(skb->protocol != htons(ETH_P_8021Q) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __pop_vlan_tci(skb, &tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely(skb->protocol != htons(ETH_P_8021Q) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __pop_vlan_tci(skb, &tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, ntohs(tci)); + return 0; +} + +static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) +{ + if (unlikely(vlan_tx_tag_present(skb))) { + u16 current_tag; + + /* push down current VLAN tag */ + current_tag = vlan_tx_tag_get(skb); + + if (!__vlan_put_tag(skb, current_tag)) + return -ENOMEM; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(skb->data + + ETH_HLEN, VLAN_HLEN, 0)); + + } + __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + return 0; +} + +static int set_eth_addr(struct sk_buff *skb, + const struct ovs_key_ethernet *eth_key) +{ + int err; + err = make_writable(skb, ETH_HLEN); + if (unlikely(err)) + return err; + + memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN); + memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN); + + return 0; +} + +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (nh->protocol == IPPROTO_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, + *addr, new_addr, 1); + } else if (nh->protocol == IPPROTO_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) + inet_proto_csum_replace4(&udp_hdr(skb)->check, skb, + *addr, new_addr, 1); + } + + csum_replace4(&nh->check, *addr, new_addr); + skb->rxhash = 0; + *addr = new_addr; +} + +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) +{ + csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); + nh->ttl = new_ttl; +} + +static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) +{ + struct iphdr *nh; + int err; + + err = make_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + + if (ipv4_key->ipv4_src != nh->saddr) + set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); + + if (ipv4_key->ipv4_dst != nh->daddr) + set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); + + if (ipv4_key->ipv4_tos != nh->tos) + ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos); + + if (ipv4_key->ipv4_ttl != nh->ttl) + set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); + + return 0; +} + +/* Must follow make_writable() since that can move the skb data. */ +static void set_tp_port(struct sk_buff *skb, __be16 *port, + __be16 new_port, __sum16 *check) +{ + inet_proto_csum_replace2(check, skb, *port, new_port, 0); + *port = new_port; + skb->rxhash = 0; +} + +static int set_udp_port(struct sk_buff *skb, + const struct ovs_key_udp *udp_port_key) +{ + struct udphdr *uh; + int err; + + err = make_writable(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); + if (unlikely(err)) + return err; + + uh = udp_hdr(skb); + if (udp_port_key->udp_src != uh->source) + set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check); + + if (udp_port_key->udp_dst != uh->dest) + set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check); + + return 0; +} + +static int set_tcp_port(struct sk_buff *skb, + const struct ovs_key_tcp *tcp_port_key) +{ + struct tcphdr *th; + int err; + + err = make_writable(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr)); + if (unlikely(err)) + return err; + + th = tcp_hdr(skb); + if (tcp_port_key->tcp_src != th->source) + set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); + + if (tcp_port_key->tcp_dst != th->dest) + set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); + + return 0; +} + +static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +{ + struct vport *vport; + + if (unlikely(!skb)) + return -ENOMEM; + + vport = rcu_dereference(dp->ports[out_port]); + if (unlikely(!vport)) { + kfree_skb(skb); + return -ENODEV; + } + + ovs_vport_send(vport, skb); + return 0; +} + +static int output_userspace(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + struct dp_upcall_info upcall; + const struct nlattr *a; + int rem; + + upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.key = &OVS_CB(skb)->flow->key; + upcall.userdata = NULL; + upcall.pid = 0; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_USERSPACE_ATTR_USERDATA: + upcall.userdata = a; + break; + + case OVS_USERSPACE_ATTR_PID: + upcall.pid = nla_get_u32(a); + break; + } + } + + return ovs_dp_upcall(dp, skb, &upcall); +} + +static int sample(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + const struct nlattr *acts_list = NULL; + const struct nlattr *a; + int rem; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (net_random() >= nla_get_u32(a)) + return 0; + break; + + case OVS_SAMPLE_ATTR_ACTIONS: + acts_list = a; + break; + } + } + + return do_execute_actions(dp, skb, nla_data(acts_list), + nla_len(acts_list), true); +} + +static int execute_set_action(struct sk_buff *skb, + const struct nlattr *nested_attr) +{ + int err = 0; + + switch (nla_type(nested_attr)) { + case OVS_KEY_ATTR_PRIORITY: + skb->priority = nla_get_u32(nested_attr); + break; + + case OVS_KEY_ATTR_ETHERNET: + err = set_eth_addr(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_IPV4: + err = set_ipv4(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_TCP: + err = set_tcp_port(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_UDP: + err = set_udp_port(skb, nla_data(nested_attr)); + break; + } + + return err; +} + +/* Execute a list of actions against 'skb'. */ +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr, int len, bool keep_skb) +{ + /* Every output action needs a separate clone of 'skb', but the common + * case is just a single output action, so that doing a clone and + * then freeing the original skbuff is wasteful. So the following code + * is slightly obscure just to avoid that. */ + int prev_port = -1; + const struct nlattr *a; + int rem; + + for (a = attr, rem = len; rem > 0; + a = nla_next(a, &rem)) { + int err = 0; + + if (prev_port != -1) { + do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); + prev_port = -1; + } + + switch (nla_type(a)) { + case OVS_ACTION_ATTR_OUTPUT: + prev_port = nla_get_u32(a); + break; + + case OVS_ACTION_ATTR_USERSPACE: + output_userspace(dp, skb, a); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + err = push_vlan(skb, nla_data(a)); + if (unlikely(err)) /* skb already freed. */ + return err; + break; + + case OVS_ACTION_ATTR_POP_VLAN: + err = pop_vlan(skb); + break; + + case OVS_ACTION_ATTR_SET: + err = execute_set_action(skb, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample(dp, skb, a); + break; + } + + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + } + + if (prev_port != -1) { + if (keep_skb) + skb = skb_clone(skb, GFP_ATOMIC); + + do_output(dp, skb, prev_port); + } else if (!keep_skb) + consume_skb(skb); + + return 0; +} + +/* Execute a list of actions against 'skb'. */ +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) +{ + struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + return do_execute_actions(dp, skb, acts->actions, + acts->actions_len, false); +} diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c new file mode 100644 index 000000000000..9a2725114e99 --- /dev/null +++ b/net/openvswitch/datapath.c @@ -0,0 +1,1912 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/time.h> +#include <linux/etherdevice.h> +#include <linux/genetlink.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/version.h> +#include <linux/ethtool.h> +#include <linux/wait.h> +#include <asm/system.h> +#include <asm/div64.h> +#include <linux/highmem.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv4.h> +#include <linux/inetdevice.h> +#include <linux/list.h> +#include <linux/openvswitch.h> +#include <linux/rculist.h> +#include <linux/dmi.h> +#include <linux/workqueue.h> +#include <net/genetlink.h> + +#include "datapath.h" +#include "flow.h" +#include "vport-internal_dev.h" + +/** + * DOC: Locking: + * + * Writes to device state (add/remove datapath, port, set operations on vports, + * etc.) are protected by RTNL. + * + * Writes to other state (flow table modifications, set miscellaneous datapath + * parameters, etc.) are protected by genl_mutex. The RTNL lock nests inside + * genl_mutex. + * + * Reads are protected by RCU. + * + * There are a few special cases (mostly stats) that have their own + * synchronization but they nest under all of above and don't interact with + * each other. + */ + +/* Global list of datapaths to enable dumping them all out. + * Protected by genl_mutex. + */ +static LIST_HEAD(dps); + +#define REHASH_FLOW_INTERVAL (10 * 60 * HZ) +static void rehash_flow_table(struct work_struct *work); +static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table); + +static struct vport *new_vport(const struct vport_parms *); +static int queue_gso_packets(int dp_ifindex, struct sk_buff *, + const struct dp_upcall_info *); +static int queue_userspace_packet(int dp_ifindex, struct sk_buff *, + const struct dp_upcall_info *); + +/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */ +static struct datapath *get_dp(int dp_ifindex) +{ + struct datapath *dp = NULL; + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(&init_net, dp_ifindex); + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + if (vport) + dp = vport->dp; + } + rcu_read_unlock(); + + return dp; +} + +/* Must be called with rcu_read_lock or RTNL lock. */ +const char *ovs_dp_name(const struct datapath *dp) +{ + struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]); + return vport->ops->get_name(vport); +} + +static int get_dpifindex(struct datapath *dp) +{ + struct vport *local; + int ifindex; + + rcu_read_lock(); + + local = rcu_dereference(dp->ports[OVSP_LOCAL]); + if (local) + ifindex = local->ops->get_ifindex(local); + else + ifindex = 0; + + rcu_read_unlock(); + + return ifindex; +} + +static void destroy_dp_rcu(struct rcu_head *rcu) +{ + struct datapath *dp = container_of(rcu, struct datapath, rcu); + + ovs_flow_tbl_destroy((__force struct flow_table *)dp->table); + free_percpu(dp->stats_percpu); + kfree(dp); +} + +/* Called with RTNL lock and genl_lock. */ +static struct vport *new_vport(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_add(parms); + if (!IS_ERR(vport)) { + struct datapath *dp = parms->dp; + + rcu_assign_pointer(dp->ports[parms->port_no], vport); + list_add(&vport->node, &dp->port_list); + } + + return vport; +} + +/* Called with RTNL lock. */ +void ovs_dp_detach_port(struct vport *p) +{ + ASSERT_RTNL(); + + /* First drop references to device. */ + list_del(&p->node); + rcu_assign_pointer(p->dp->ports[p->port_no], NULL); + + /* Then destroy it. */ + ovs_vport_del(p); +} + +/* Must be called with rcu_read_lock. */ +void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +{ + struct datapath *dp = p->dp; + struct sw_flow *flow; + struct dp_stats_percpu *stats; + struct sw_flow_key key; + u64 *stats_counter; + int error; + int key_len; + + stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id()); + + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_extract(skb, p->port_no, &key, &key_len); + if (unlikely(error)) { + kfree_skb(skb); + return; + } + + /* Look up flow. */ + flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len); + if (unlikely(!flow)) { + struct dp_upcall_info upcall; + + upcall.cmd = OVS_PACKET_CMD_MISS; + upcall.key = &key; + upcall.userdata = NULL; + upcall.pid = p->upcall_pid; + ovs_dp_upcall(dp, skb, &upcall); + consume_skb(skb); + stats_counter = &stats->n_missed; + goto out; + } + + OVS_CB(skb)->flow = flow; + + stats_counter = &stats->n_hit; + ovs_flow_used(OVS_CB(skb)->flow, skb); + ovs_execute_actions(dp, skb); + +out: + /* Update datapath statistics. */ + u64_stats_update_begin(&stats->sync); + (*stats_counter)++; + u64_stats_update_end(&stats->sync); +} + +static struct genl_family dp_packet_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX +}; + +int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, + const struct dp_upcall_info *upcall_info) +{ + struct dp_stats_percpu *stats; + int dp_ifindex; + int err; + + if (upcall_info->pid == 0) { + err = -ENOTCONN; + goto err; + } + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) { + err = -ENODEV; + goto err; + } + + if (!skb_is_gso(skb)) + err = queue_userspace_packet(dp_ifindex, skb, upcall_info); + else + err = queue_gso_packets(dp_ifindex, skb, upcall_info); + if (err) + goto err; + + return 0; + +err: + stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id()); + + u64_stats_update_begin(&stats->sync); + stats->n_lost++; + u64_stats_update_end(&stats->sync); + + return err; +} + +static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb, + const struct dp_upcall_info *upcall_info) +{ + struct dp_upcall_info later_info; + struct sw_flow_key later_key; + struct sk_buff *segs, *nskb; + int err; + + segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Queue all of the segments. */ + skb = segs; + do { + err = queue_userspace_packet(dp_ifindex, skb, upcall_info); + if (err) + break; + + if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *upcall_info->key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + + later_info = *upcall_info; + later_info.key = &later_key; + upcall_info = &later_info; + } + } while ((skb = skb->next)); + + /* Free all of the segments. */ + skb = segs; + do { + nskb = skb->next; + if (err) + kfree_skb(skb); + else + consume_skb(skb); + } while ((skb = nskb)); + return err; +} + +static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb, + const struct dp_upcall_info *upcall_info) +{ + struct ovs_header *upcall; + struct sk_buff *nskb = NULL; + struct sk_buff *user_skb; /* to be queued to userspace */ + struct nlattr *nla; + unsigned int len; + int err; + + if (vlan_tx_tag_present(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb)); + if (!skb) + return -ENOMEM; + + nskb->vlan_tci = 0; + skb = nskb; + } + + if (nla_attr_size(skb->len) > USHRT_MAX) { + err = -EFBIG; + goto out; + } + + len = sizeof(struct ovs_header); + len += nla_total_size(skb->len); + len += nla_total_size(FLOW_BUFSIZE); + if (upcall_info->cmd == OVS_PACKET_CMD_ACTION) + len += nla_total_size(8); + + user_skb = genlmsg_new(len, GFP_ATOMIC); + if (!user_skb) { + err = -ENOMEM; + goto out; + } + + upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, + 0, upcall_info->cmd); + upcall->dp_ifindex = dp_ifindex; + + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); + ovs_flow_to_nlattrs(upcall_info->key, user_skb); + nla_nest_end(user_skb, nla); + + if (upcall_info->userdata) + nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA, + nla_get_u64(upcall_info->userdata)); + + nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); + + skb_copy_and_csum_dev(skb, nla_data(nla)); + + err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid); + +out: + kfree_skb(nskb); + return err; +} + +/* Called with genl_mutex. */ +static int flush_flows(int dp_ifindex) +{ + struct flow_table *old_table; + struct flow_table *new_table; + struct datapath *dp; + + dp = get_dp(dp_ifindex); + if (!dp) + return -ENODEV; + + old_table = genl_dereference(dp->table); + new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS); + if (!new_table) + return -ENOMEM; + + rcu_assign_pointer(dp->table, new_table); + + ovs_flow_tbl_deferred_destroy(old_table); + return 0; +} + +static int validate_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth); + +static int validate_sample(const struct nlattr *attr, + const struct sw_flow_key *key, int depth) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + return validate_actions(actions, key, depth + 1); +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + + /* There can be only one key in a action */ + if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + if (key_type > OVS_KEY_ATTR_MAX || + nla_len(ovs_key) != ovs_key_lens[key_type]) + return -EINVAL; + + switch (key_type) { + const struct ovs_key_ipv4 *ipv4_key; + + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_ETHERNET: + break; + + case OVS_KEY_ATTR_IPV4: + if (flow_key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + + if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_TCP: + if (flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_UDP: + if (flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst) + return -EINVAL; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, + attr, userspace_policy); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static int validate_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth) +{ + const struct nlattr *a; + int rem, err; + + if (depth >= SAMPLE_ACTION_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. */ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + + case OVS_ACTION_ATTR_POP_VLAN: + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + vlan = nla_data(a); + if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = validate_sample(a, key, depth); + if (err) + return err; + break; + + default: + return -EINVAL; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +static void clear_stats(struct sw_flow *flow) +{ + flow->used = 0; + flow->tcp_flags = 0; + flow->packet_count = 0; + flow->byte_count = 0; +} + +static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = info->userhdr; + struct nlattr **a = info->attrs; + struct sw_flow_actions *acts; + struct sk_buff *packet; + struct sw_flow *flow; + struct datapath *dp; + struct ethhdr *eth; + int len; + int err; + int key_len; + + err = -EINVAL; + if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || + !a[OVS_PACKET_ATTR_ACTIONS] || + nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN) + goto err; + + len = nla_len(a[OVS_PACKET_ATTR_PACKET]); + packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); + err = -ENOMEM; + if (!packet) + goto err; + skb_reserve(packet, NET_IP_ALIGN); + + memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len); + + skb_reset_mac_header(packet); + eth = eth_hdr(packet); + + /* Normally, setting the skb 'protocol' field would be handled by a + * call to eth_type_trans(), but it assumes there's a sending + * device, which we may not have. */ + if (ntohs(eth->h_proto) >= 1536) + packet->protocol = eth->h_proto; + else + packet->protocol = htons(ETH_P_802_2); + + /* Build an sw_flow for sending this packet. */ + flow = ovs_flow_alloc(); + err = PTR_ERR(flow); + if (IS_ERR(flow)) + goto err_kfree_skb; + + err = ovs_flow_extract(packet, -1, &flow->key, &key_len); + if (err) + goto err_flow_free; + + err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority, + &flow->key.phy.in_port, + a[OVS_PACKET_ATTR_KEY]); + if (err) + goto err_flow_free; + + err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0); + if (err) + goto err_flow_free; + + flow->hash = ovs_flow_hash(&flow->key, key_len); + + acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]); + err = PTR_ERR(acts); + if (IS_ERR(acts)) + goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->flow = flow; + packet->priority = flow->key.phy.priority; + + rcu_read_lock(); + dp = get_dp(ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto err_unlock; + + local_bh_disable(); + err = ovs_execute_actions(dp, packet); + local_bh_enable(); + rcu_read_unlock(); + + ovs_flow_free(flow); + return err; + +err_unlock: + rcu_read_unlock(); +err_flow_free: + ovs_flow_free(flow); +err_kfree_skb: + kfree_skb(packet); +err: + return err; +} + +static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { + [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC }, + [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, +}; + +static struct genl_ops dp_packet_genl_ops[] = { + { .cmd = OVS_PACKET_CMD_EXECUTE, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = packet_policy, + .doit = ovs_packet_cmd_execute + } +}; + +static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) +{ + int i; + struct flow_table *table = genl_dereference(dp->table); + + stats->n_flows = ovs_flow_tbl_count(table); + + stats->n_hit = stats->n_missed = stats->n_lost = 0; + for_each_possible_cpu(i) { + const struct dp_stats_percpu *percpu_stats; + struct dp_stats_percpu local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dp->stats_percpu, i); + + do { + start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + + stats->n_hit += local_stats.n_hit; + stats->n_missed += local_stats.n_missed; + stats->n_lost += local_stats.n_lost; + } +} + +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, +}; + +static struct genl_family dp_flow_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX +}; + +static struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP +}; + +/* Called with genl_lock. */ +static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, + struct sk_buff *skb, u32 pid, + u32 seq, u32 flags, u8 cmd) +{ + const int skb_orig_len = skb->len; + const struct sw_flow_actions *sf_acts; + struct ovs_flow_stats stats; + struct ovs_header *ovs_header; + struct nlattr *nla; + unsigned long used; + u8 tcp_flags; + int err; + + sf_acts = rcu_dereference_protected(flow->sf_acts, + lockdep_genl_is_held()); + + ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = get_dpifindex(dp); + + nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); + if (!nla) + goto nla_put_failure; + err = ovs_flow_to_nlattrs(&flow->key, skb); + if (err) + goto error; + nla_nest_end(skb, nla); + + spin_lock_bh(&flow->lock); + used = flow->used; + stats.n_packets = flow->packet_count; + stats.n_bytes = flow->byte_count; + tcp_flags = flow->tcp_flags; + spin_unlock_bh(&flow->lock); + + if (used) + NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)); + + if (stats.n_packets) + NLA_PUT(skb, OVS_FLOW_ATTR_STATS, + sizeof(struct ovs_flow_stats), &stats); + + if (tcp_flags) + NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags); + + /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if + * this is the first flow to be dumped into 'skb'. This is unusual for + * Netlink but individual action lists can be longer than + * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. + * The userspace caller can always fetch the actions separately if it + * really wants them. (Most userspace callers in fact don't care.) + * + * This can only fail for dump operations because the skb is always + * properly sized for single flows. + */ + err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len, + sf_acts->actions); + if (err < 0 && skb_orig_len) + goto error; + + return genlmsg_end(skb, ovs_header); + +nla_put_failure: + err = -EMSGSIZE; +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow) +{ + const struct sw_flow_actions *sf_acts; + int len; + + sf_acts = rcu_dereference_protected(flow->sf_acts, + lockdep_genl_is_held()); + + /* OVS_FLOW_ATTR_KEY */ + len = nla_total_size(FLOW_BUFSIZE); + /* OVS_FLOW_ATTR_ACTIONS */ + len += nla_total_size(sf_acts->actions_len); + /* OVS_FLOW_ATTR_STATS */ + len += nla_total_size(sizeof(struct ovs_flow_stats)); + /* OVS_FLOW_ATTR_TCP_FLAGS */ + len += nla_total_size(1); + /* OVS_FLOW_ATTR_USED */ + len += nla_total_size(8); + + len += NLMSG_ALIGN(sizeof(struct ovs_header)); + + return genlmsg_new(len, GFP_KERNEL); +} + +static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, + struct datapath *dp, + u32 pid, u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = ovs_flow_cmd_alloc_info(flow); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd); + BUG_ON(retval < 0); + return skb; +} + +static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sw_flow *flow; + struct sk_buff *reply; + struct datapath *dp; + struct flow_table *table; + int error; + int key_len; + + /* Extract key. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) + goto error; + error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + if (error) + goto error; + + /* Validate actions. */ + if (a[OVS_FLOW_ATTR_ACTIONS]) { + error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0); + if (error) + goto error; + } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { + error = -EINVAL; + goto error; + } + + dp = get_dp(ovs_header->dp_ifindex); + error = -ENODEV; + if (!dp) + goto error; + + table = genl_dereference(dp->table); + flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (!flow) { + struct sw_flow_actions *acts; + + /* Bail out if we're not allowed to create a new flow. */ + error = -ENOENT; + if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) + goto error; + + /* Expand table, if necessary, to make room. */ + if (ovs_flow_tbl_need_to_expand(table)) { + struct flow_table *new_table; + + new_table = ovs_flow_tbl_expand(table); + if (!IS_ERR(new_table)) { + rcu_assign_pointer(dp->table, new_table); + ovs_flow_tbl_deferred_destroy(table); + table = genl_dereference(dp->table); + } + } + + /* Allocate flow. */ + flow = ovs_flow_alloc(); + if (IS_ERR(flow)) { + error = PTR_ERR(flow); + goto error; + } + flow->key = key; + clear_stats(flow); + + /* Obtain actions. */ + acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]); + error = PTR_ERR(acts); + if (IS_ERR(acts)) + goto error_free_flow; + rcu_assign_pointer(flow->sf_acts, acts); + + /* Put flow in bucket. */ + flow->hash = ovs_flow_hash(&key, key_len); + ovs_flow_tbl_insert(table, flow); + + reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, + info->snd_seq, + OVS_FLOW_CMD_NEW); + } else { + /* We found a matching flow. */ + struct sw_flow_actions *old_acts; + struct nlattr *acts_attrs; + + /* Bail out if we're not allowed to modify an existing flow. + * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL + * because Generic Netlink treats the latter as a dump + * request. We also accept NLM_F_EXCL in case that bug ever + * gets fixed. + */ + error = -EEXIST; + if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW && + info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) + goto error; + + /* Update actions. */ + old_acts = rcu_dereference_protected(flow->sf_acts, + lockdep_genl_is_held()); + acts_attrs = a[OVS_FLOW_ATTR_ACTIONS]; + if (acts_attrs && + (old_acts->actions_len != nla_len(acts_attrs) || + memcmp(old_acts->actions, nla_data(acts_attrs), + old_acts->actions_len))) { + struct sw_flow_actions *new_acts; + + new_acts = ovs_flow_actions_alloc(acts_attrs); + error = PTR_ERR(new_acts); + if (IS_ERR(new_acts)) + goto error; + + rcu_assign_pointer(flow->sf_acts, new_acts); + ovs_flow_deferred_free_acts(old_acts); + } + + reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, + info->snd_seq, OVS_FLOW_CMD_NEW); + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) { + spin_lock_bh(&flow->lock); + clear_stats(flow); + spin_unlock_bh(&flow->lock); + } + } + + if (!IS_ERR(reply)) + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_flow_multicast_group.id, info->nlhdr, + GFP_KERNEL); + else + netlink_set_err(init_net.genl_sock, 0, + ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); + return 0; + +error_free_flow: + ovs_flow_free(flow); +error: + return error; +} + +static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow; + struct datapath *dp; + struct flow_table *table; + int err; + int key_len; + + if (!a[OVS_FLOW_ATTR_KEY]) + return -EINVAL; + err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + if (err) + return err; + + dp = get_dp(ovs_header->dp_ifindex); + if (!dp) + return -ENODEV; + + table = genl_dereference(dp->table); + flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (!flow) + return -ENOENT; + + reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, + info->snd_seq, OVS_FLOW_CMD_NEW); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + return genlmsg_reply(reply, info); +} + +static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow; + struct datapath *dp; + struct flow_table *table; + int err; + int key_len; + + if (!a[OVS_FLOW_ATTR_KEY]) + return flush_flows(ovs_header->dp_ifindex); + err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + if (err) + return err; + + dp = get_dp(ovs_header->dp_ifindex); + if (!dp) + return -ENODEV; + + table = genl_dereference(dp->table); + flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (!flow) + return -ENOENT; + + reply = ovs_flow_cmd_alloc_info(flow); + if (!reply) + return -ENOMEM; + + ovs_flow_tbl_remove(table, flow); + + err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid, + info->snd_seq, 0, OVS_FLOW_CMD_DEL); + BUG_ON(err < 0); + + ovs_flow_deferred_free(flow); + + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL); + return 0; +} + +static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct datapath *dp; + struct flow_table *table; + + dp = get_dp(ovs_header->dp_ifindex); + if (!dp) + return -ENODEV; + + table = genl_dereference(dp->table); + + for (;;) { + struct sw_flow *flow; + u32 bucket, obj; + + bucket = cb->args[0]; + obj = cb->args[1]; + flow = ovs_flow_tbl_next(table, &bucket, &obj); + if (!flow) + break; + + if (ovs_flow_cmd_fill_info(flow, dp, skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_FLOW_CMD_NEW) < 0) + break; + + cb->args[0] = bucket; + cb->args[1] = obj; + } + return skb->len; +} + +static struct genl_ops dp_flow_genl_ops[] = { + { .cmd = OVS_FLOW_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_new_or_set + }, + { .cmd = OVS_FLOW_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_del + }, + { .cmd = OVS_FLOW_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_get, + .dumpit = ovs_flow_cmd_dump + }, + { .cmd = OVS_FLOW_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_new_or_set, + }, +}; + +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, +}; + +static struct genl_family dp_datapath_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX +}; + +static struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP +}; + +static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, + u32 pid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_dp_stats dp_stats; + int err; + + ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family, + flags, cmd); + if (!ovs_header) + goto error; + + ovs_header->dp_ifindex = get_dpifindex(dp); + + rcu_read_lock(); + err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + + get_dp_stats(dp, &dp_stats); + NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats); + + return genlmsg_end(skb, ovs_header); + +nla_put_failure: + genlmsg_cancel(skb, ovs_header); +error: + return -EMSGSIZE; +} + +static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid, + u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd); + if (retval < 0) { + kfree_skb(skb); + return ERR_PTR(retval); + } + return skb; +} + +/* Called with genl_mutex and optionally with RTNL lock also. */ +static struct datapath *lookup_datapath(struct ovs_header *ovs_header, + struct nlattr *a[OVS_DP_ATTR_MAX + 1]) +{ + struct datapath *dp; + + if (!a[OVS_DP_ATTR_NAME]) + dp = get_dp(ovs_header->dp_ifindex); + else { + struct vport *vport; + + rcu_read_lock(); + vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME])); + dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; + rcu_read_unlock(); + } + return dp ? dp : ERR_PTR(-ENODEV); +} + +static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct vport_parms parms; + struct sk_buff *reply; + struct datapath *dp; + struct vport *vport; + int err; + + err = -EINVAL; + if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) + goto err; + + rtnl_lock(); + err = -ENODEV; + if (!try_module_get(THIS_MODULE)) + goto err_unlock_rtnl; + + err = -ENOMEM; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (dp == NULL) + goto err_put_module; + INIT_LIST_HEAD(&dp->port_list); + + /* Allocate table. */ + err = -ENOMEM; + rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS)); + if (!dp->table) + goto err_free_dp; + + dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); + if (!dp->stats_percpu) { + err = -ENOMEM; + goto err_destroy_table; + } + + /* Set up our datapath device. */ + parms.name = nla_data(a[OVS_DP_ATTR_NAME]); + parms.type = OVS_VPORT_TYPE_INTERNAL; + parms.options = NULL; + parms.dp = dp; + parms.port_no = OVSP_LOCAL; + parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + + vport = new_vport(&parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + if (err == -EBUSY) + err = -EEXIST; + + goto err_destroy_percpu; + } + + reply = ovs_dp_cmd_build_info(dp, info->snd_pid, + info->snd_seq, OVS_DP_CMD_NEW); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto err_destroy_local_port; + + list_add_tail(&dp->list_node, &dps); + rtnl_unlock(); + + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_datapath_multicast_group.id, info->nlhdr, + GFP_KERNEL); + return 0; + +err_destroy_local_port: + ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL])); +err_destroy_percpu: + free_percpu(dp->stats_percpu); +err_destroy_table: + ovs_flow_tbl_destroy(genl_dereference(dp->table)); +err_free_dp: + kfree(dp); +err_put_module: + module_put(THIS_MODULE); +err_unlock_rtnl: + rtnl_unlock(); +err: + return err; +} + +static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct vport *vport, *next_vport; + struct sk_buff *reply; + struct datapath *dp; + int err; + + rtnl_lock(); + dp = lookup_datapath(info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto exit_unlock; + + reply = ovs_dp_cmd_build_info(dp, info->snd_pid, + info->snd_seq, OVS_DP_CMD_DEL); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto exit_unlock; + + list_for_each_entry_safe(vport, next_vport, &dp->port_list, node) + if (vport->port_no != OVSP_LOCAL) + ovs_dp_detach_port(vport); + + list_del(&dp->list_node); + ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL])); + + /* rtnl_unlock() will wait until all the references to devices that + * are pending unregistration have been dropped. We do it here to + * ensure that any internal devices (which contain DP pointers) are + * fully destroyed before freeing the datapath. + */ + rtnl_unlock(); + + call_rcu(&dp->rcu, destroy_dp_rcu); + module_put(THIS_MODULE); + + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_datapath_multicast_group.id, info->nlhdr, + GFP_KERNEL); + + return 0; + +exit_unlock: + rtnl_unlock(); + return err; +} + +static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + dp = lookup_datapath(info->userhdr, info->attrs); + if (IS_ERR(dp)) + return PTR_ERR(dp); + + reply = ovs_dp_cmd_build_info(dp, info->snd_pid, + info->snd_seq, OVS_DP_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + netlink_set_err(init_net.genl_sock, 0, + ovs_dp_datapath_multicast_group.id, err); + return 0; + } + + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_datapath_multicast_group.id, info->nlhdr, + GFP_KERNEL); + + return 0; +} + +static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + + dp = lookup_datapath(info->userhdr, info->attrs); + if (IS_ERR(dp)) + return PTR_ERR(dp); + + reply = ovs_dp_cmd_build_info(dp, info->snd_pid, + info->snd_seq, OVS_DP_CMD_NEW); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + return genlmsg_reply(reply, info); +} + +static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct datapath *dp; + int skip = cb->args[0]; + int i = 0; + + list_for_each_entry(dp, &dps, list_node) { + if (i < skip) + continue; + if (ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_DP_CMD_NEW) < 0) + break; + i++; + } + + cb->args[0] = i; + + return skb->len; +} + +static struct genl_ops dp_datapath_genl_ops[] = { + { .cmd = OVS_DP_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_new + }, + { .cmd = OVS_DP_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_del + }, + { .cmd = OVS_DP_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_get, + .dumpit = ovs_dp_cmd_dump + }, + { .cmd = OVS_DP_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_set, + }, +}; + +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + +static struct genl_family dp_vport_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX +}; + +struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP +}; + +/* Called with RTNL lock or RCU read lock. */ +static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + u32 pid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_vport_stats vport_stats; + int err; + + ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = get_dpifindex(vport->dp); + + NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no); + NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type); + NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)); + NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid); + + ovs_vport_get_stats(vport, &vport_stats); + NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), + &vport_stats); + + err = ovs_vport_get_options(vport, skb); + if (err == -EMSGSIZE) + goto error; + + return genlmsg_end(skb, ovs_header); + +nla_put_failure: + err = -EMSGSIZE; +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +/* Called with RTNL lock or RCU read lock. */ +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid, + u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd); + if (retval < 0) { + kfree_skb(skb); + return ERR_PTR(retval); + } + return skb; +} + +/* Called with RTNL lock or RCU read lock. */ +static struct vport *lookup_vport(struct ovs_header *ovs_header, + struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) +{ + struct datapath *dp; + struct vport *vport; + + if (a[OVS_VPORT_ATTR_NAME]) { + vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME])); + if (!vport) + return ERR_PTR(-ENODEV); + return vport; + } else if (a[OVS_VPORT_ATTR_PORT_NO]) { + u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + + if (port_no >= DP_MAX_PORTS) + return ERR_PTR(-EFBIG); + + dp = get_dp(ovs_header->dp_ifindex); + if (!dp) + return ERR_PTR(-ENODEV); + + vport = rcu_dereference_rtnl(dp->ports[port_no]); + if (!vport) + return ERR_PTR(-ENOENT); + return vport; + } else + return ERR_PTR(-EINVAL); +} + +static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct vport_parms parms; + struct sk_buff *reply; + struct vport *vport; + struct datapath *dp; + u32 port_no; + int err; + + err = -EINVAL; + if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || + !a[OVS_VPORT_ATTR_UPCALL_PID]) + goto exit; + + rtnl_lock(); + dp = get_dp(ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto exit_unlock; + + if (a[OVS_VPORT_ATTR_PORT_NO]) { + port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + + err = -EFBIG; + if (port_no >= DP_MAX_PORTS) + goto exit_unlock; + + vport = rtnl_dereference(dp->ports[port_no]); + err = -EBUSY; + if (vport) + goto exit_unlock; + } else { + for (port_no = 1; ; port_no++) { + if (port_no >= DP_MAX_PORTS) { + err = -EFBIG; + goto exit_unlock; + } + vport = rtnl_dereference(dp->ports[port_no]); + if (!vport) + break; + } + } + + parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + parms.options = a[OVS_VPORT_ATTR_OPTIONS]; + parms.dp = dp; + parms.port_no = port_no; + parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + vport = new_vport(&parms); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq, + OVS_VPORT_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + ovs_dp_detach_port(vport); + goto exit_unlock; + } + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL); + +exit_unlock: + rtnl_unlock(); +exit: + return err; +} + +static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + rtnl_lock(); + vport = lookup_vport(info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + err = 0; + if (a[OVS_VPORT_ATTR_TYPE] && + nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) + err = -EINVAL; + + if (!err && a[OVS_VPORT_ATTR_OPTIONS]) + err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); + if (!err && a[OVS_VPORT_ATTR_UPCALL_PID]) + vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq, + OVS_VPORT_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + netlink_set_err(init_net.genl_sock, 0, + ovs_dp_vport_multicast_group.id, err); + return 0; + } + + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL); + +exit_unlock: + rtnl_unlock(); + return err; +} + +static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + rtnl_lock(); + vport = lookup_vport(info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + if (vport->port_no == OVSP_LOCAL) { + err = -EINVAL; + goto exit_unlock; + } + + reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq, + OVS_VPORT_CMD_DEL); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto exit_unlock; + + ovs_dp_detach_port(vport); + + genl_notify(reply, genl_info_net(info), info->snd_pid, + ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL); + +exit_unlock: + rtnl_unlock(); + return err; +} + +static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sk_buff *reply; + struct vport *vport; + int err; + + rcu_read_lock(); + vport = lookup_vport(ovs_header, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq, + OVS_VPORT_CMD_NEW); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto exit_unlock; + + rcu_read_unlock(); + + return genlmsg_reply(reply, info); + +exit_unlock: + rcu_read_unlock(); + return err; +} + +static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct datapath *dp; + u32 port_no; + int retval; + + dp = get_dp(ovs_header->dp_ifindex); + if (!dp) + return -ENODEV; + + rcu_read_lock(); + for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) { + struct vport *vport; + + vport = rcu_dereference(dp->ports[port_no]); + if (!vport) + continue; + + if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_VPORT_CMD_NEW) < 0) + break; + } + rcu_read_unlock(); + + cb->args[0] = port_no; + retval = skb->len; + + return retval; +} + +static void rehash_flow_table(struct work_struct *work) +{ + struct datapath *dp; + + genl_lock(); + + list_for_each_entry(dp, &dps, list_node) { + struct flow_table *old_table = genl_dereference(dp->table); + struct flow_table *new_table; + + new_table = ovs_flow_tbl_rehash(old_table); + if (!IS_ERR(new_table)) { + rcu_assign_pointer(dp->table, new_table); + ovs_flow_tbl_deferred_destroy(old_table); + } + } + + genl_unlock(); + + schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); +} + +static struct genl_ops dp_vport_genl_ops[] = { + { .cmd = OVS_VPORT_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_new + }, + { .cmd = OVS_VPORT_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_del + }, + { .cmd = OVS_VPORT_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_get, + .dumpit = ovs_vport_cmd_dump + }, + { .cmd = OVS_VPORT_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_set, + }, +}; + +struct genl_family_and_ops { + struct genl_family *family; + struct genl_ops *ops; + int n_ops; + struct genl_multicast_group *group; +}; + +static const struct genl_family_and_ops dp_genl_families[] = { + { &dp_datapath_genl_family, + dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), + &ovs_dp_datapath_multicast_group }, + { &dp_vport_genl_family, + dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), + &ovs_dp_vport_multicast_group }, + { &dp_flow_genl_family, + dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), + &ovs_dp_flow_multicast_group }, + { &dp_packet_genl_family, + dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), + NULL }, +}; + +static void dp_unregister_genl(int n_families) +{ + int i; + + for (i = 0; i < n_families; i++) + genl_unregister_family(dp_genl_families[i].family); +} + +static int dp_register_genl(void) +{ + int n_registered; + int err; + int i; + + n_registered = 0; + for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { + const struct genl_family_and_ops *f = &dp_genl_families[i]; + + err = genl_register_family_with_ops(f->family, f->ops, + f->n_ops); + if (err) + goto error; + n_registered++; + + if (f->group) { + err = genl_register_mc_group(f->family, f->group); + if (err) + goto error; + } + } + + return 0; + +error: + dp_unregister_genl(n_registered); + return err; +} + +static int __init dp_init(void) +{ + struct sk_buff *dummy_skb; + int err; + + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb)); + + pr_info("Open vSwitch switching datapath\n"); + + err = ovs_flow_init(); + if (err) + goto error; + + err = ovs_vport_init(); + if (err) + goto error_flow_exit; + + err = register_netdevice_notifier(&ovs_dp_device_notifier); + if (err) + goto error_vport_exit; + + err = dp_register_genl(); + if (err < 0) + goto error_unreg_notifier; + + schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); + + return 0; + +error_unreg_notifier: + unregister_netdevice_notifier(&ovs_dp_device_notifier); +error_vport_exit: + ovs_vport_exit(); +error_flow_exit: + ovs_flow_exit(); +error: + return err; +} + +static void dp_cleanup(void) +{ + cancel_delayed_work_sync(&rehash_flow_wq); + rcu_barrier(); + dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + unregister_netdevice_notifier(&ovs_dp_device_notifier); + ovs_vport_exit(); + ovs_flow_exit(); +} + +module_init(dp_init); +module_exit(dp_cleanup); + +MODULE_DESCRIPTION("Open vSwitch switching datapath"); +MODULE_LICENSE("GPL"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h new file mode 100644 index 000000000000..5b9f884b7055 --- /dev/null +++ b/net/openvswitch/datapath.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef DATAPATH_H +#define DATAPATH_H 1 + +#include <asm/page.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/u64_stats_sync.h> +#include <linux/version.h> + +#include "flow.h" + +struct vport; + +#define DP_MAX_PORTS 1024 +#define SAMPLE_ACTION_DEPTH 3 + +/** + * struct dp_stats_percpu - per-cpu packet processing statistics for a given + * datapath. + * @n_hit: Number of received packets for which a matching flow was found in + * the flow table. + * @n_miss: Number of received packets that had no matching flow in the flow + * table. The sum of @n_hit and @n_miss is the number of packets that have + * been received by the datapath. + * @n_lost: Number of received packets that had no matching flow in the flow + * table that could not be sent to userspace (normally due to an overflow in + * one of the datapath's queues). + */ +struct dp_stats_percpu { + u64 n_hit; + u64 n_missed; + u64 n_lost; + struct u64_stats_sync sync; +}; + +/** + * struct datapath - datapath for flow-based packet switching + * @rcu: RCU callback head for deferred destruction. + * @list_node: Element in global 'dps' list. + * @n_flows: Number of flows currently in flow table. + * @table: Current flow table. Protected by genl_lock and RCU. + * @ports: Map from port number to &struct vport. %OVSP_LOCAL port + * always exists, other ports may be %NULL. Protected by RTNL and RCU. + * @port_list: List of all ports in @ports in arbitrary order. RTNL required + * to iterate or modify. + * @stats_percpu: Per-CPU datapath statistics. + * + * Context: See the comment on locking at the top of datapath.c for additional + * locking information. + */ +struct datapath { + struct rcu_head rcu; + struct list_head list_node; + + /* Flow table. */ + struct flow_table __rcu *table; + + /* Switch ports. */ + struct vport __rcu *ports[DP_MAX_PORTS]; + struct list_head port_list; + + /* Stats. */ + struct dp_stats_percpu __percpu *stats_percpu; +}; + +/** + * struct ovs_skb_cb - OVS data in skb CB + * @flow: The flow associated with this packet. May be %NULL if no flow. + */ +struct ovs_skb_cb { + struct sw_flow *flow; +}; +#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) + +/** + * struct dp_upcall - metadata to include with a packet to send to userspace + * @cmd: One of %OVS_PACKET_CMD_*. + * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull. + * @userdata: If nonnull, its u64 value is extracted and passed to userspace as + * %OVS_PACKET_ATTR_USERDATA. + * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no + * packet is sent and the packet is accounted in the datapath's @n_lost + * counter. + */ +struct dp_upcall_info { + u8 cmd; + const struct sw_flow_key *key; + const struct nlattr *userdata; + u32 pid; +}; + +extern struct notifier_block ovs_dp_device_notifier; +extern struct genl_multicast_group ovs_dp_vport_multicast_group; + +void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_detach_port(struct vport *); +int ovs_dp_upcall(struct datapath *, struct sk_buff *, + const struct dp_upcall_info *); + +const char *ovs_dp_name(const struct datapath *dp); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, + u8 cmd); + +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); +#endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c new file mode 100644 index 000000000000..46736518c453 --- /dev/null +++ b/net/openvswitch/dp_notify.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/netdevice.h> +#include <net/genetlink.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static int dp_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct vport *vport; + + if (ovs_is_internal_dev(dev)) + vport = ovs_internal_dev_get_vport(dev); + else + vport = ovs_netdev_get_vport(dev); + + if (!vport) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UNREGISTER: + if (!ovs_is_internal_dev(dev)) { + struct sk_buff *notify; + + notify = ovs_vport_cmd_build_info(vport, 0, 0, + OVS_VPORT_CMD_DEL); + ovs_dp_detach_port(vport); + if (IS_ERR(notify)) { + netlink_set_err(init_net.genl_sock, 0, + ovs_dp_vport_multicast_group.id, + PTR_ERR(notify)); + break; + } + + genlmsg_multicast(notify, 0, ovs_dp_vport_multicast_group.id, + GFP_KERNEL); + } + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block ovs_dp_device_notifier = { + .notifier_call = dp_device_event +}; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c new file mode 100644 index 000000000000..fe7f020a843e --- /dev/null +++ b/net/openvswitch/flow.c @@ -0,0 +1,1346 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +static struct kmem_cache *flow_cache; + +static int check_header(struct sk_buff *skb, int len) +{ + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + return 0; +} + +static bool arphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_network_offset(skb) + + sizeof(struct arp_eth_header)); +} + +static int check_iphdr(struct sk_buff *skb) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int ip_len; + int err; + + err = check_header(skb, nh_ofs + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + ip_len = ip_hdrlen(skb); + if (unlikely(ip_len < sizeof(struct iphdr) || + skb->len < nh_ofs + ip_len)) + return -EINVAL; + + skb_set_transport_header(skb, nh_ofs + ip_len); + return 0; +} + +static bool tcphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + int tcp_len; + + if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr)))) + return false; + + tcp_len = tcp_hdrlen(skb); + if (unlikely(tcp_len < sizeof(struct tcphdr) || + skb->len < th_ofs + tcp_len)) + return false; + + return true; +} + +static bool udphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); +} + +static bool icmphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmphdr)); +} + +u64 ovs_flow_used_time(unsigned long flow_jiffies) +{ + struct timespec cur_ts; + u64 cur_ms, idle_ms; + + ktime_get_ts(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; + + return cur_ms - idle_ms; +} + +#define SW_FLOW_KEY_OFFSET(field) \ + (offsetof(struct sw_flow_key, field) + \ + FIELD_SIZEOF(struct sw_flow_key, field)) + +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, + int *key_lenp) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int nh_len; + int payload_ofs; + struct ipv6hdr *nh; + uint8_t nexthdr; + __be16 frag_off; + int err; + + *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label); + + err = check_header(skb, nh_ofs + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + nexthdr = nh->nexthdr; + payload_ofs = (u8 *)(nh + 1) - skb->data; + + key->ip.proto = NEXTHDR_NONE; + key->ip.tos = ipv6_get_dsfield(nh); + key->ip.ttl = nh->hop_limit; + key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + key->ipv6.addr.src = nh->saddr; + key->ipv6.addr.dst = nh->daddr; + + payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); + if (unlikely(payload_ofs < 0)) + return -EINVAL; + + if (frag_off) { + if (frag_off & htons(~0x7)) + key->ip.frag = OVS_FRAG_TYPE_LATER; + else + key->ip.frag = OVS_FRAG_TYPE_FIRST; + } + + nh_len = payload_ofs - nh_ofs; + skb_set_transport_header(skb, nh_ofs + nh_len); + key->ip.proto = nexthdr; + return nh_len; +} + +static bool icmp6hdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmp6hdr)); +} + +#define TCP_FLAGS_OFFSET 13 +#define TCP_FLAG_MASK 0x3f + +void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) +{ + u8 tcp_flags = 0; + + if (flow->key.eth.type == htons(ETH_P_IP) && + flow->key.ip.proto == IPPROTO_TCP) { + u8 *tcp = (u8 *)tcp_hdr(skb); + tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; + } + + spin_lock(&flow->lock); + flow->used = jiffies; + flow->packet_count++; + flow->byte_count += skb->len; + flow->tcp_flags |= tcp_flags; + spin_unlock(&flow->lock); +} + +struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions) +{ + int actions_len = nla_len(actions); + struct sw_flow_actions *sfa; + + /* At least DP_MAX_PORTS actions are required to be able to flood a + * packet to every port. Factor of 2 allows for setting VLAN tags, + * etc. */ + if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4)) + return ERR_PTR(-EINVAL); + + sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = actions_len; + memcpy(sfa->actions, nla_data(actions), actions_len); + return sfa; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&flow->lock); + flow->sf_acts = NULL; + + return flow; +} + +static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) +{ + hash = jhash_1word(hash, table->hash_seed); + return flex_array_get(table->buckets, + (hash & (table->n_buckets - 1))); +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head *), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + +struct flow_table *ovs_flow_tbl_alloc(int new_size) +{ + struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); + + if (!table) + return NULL; + + table->buckets = alloc_buckets(new_size); + + if (!table->buckets) { + kfree(table); + return NULL; + } + table->n_buckets = new_size; + table->count = 0; + table->node_ver = 0; + table->keep_flows = false; + get_random_bytes(&table->hash_seed, sizeof(u32)); + + return table; +} + +void ovs_flow_tbl_destroy(struct flow_table *table) +{ + int i; + + if (!table) + return; + + if (table->keep_flows) + goto skip_flows; + + for (i = 0; i < table->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *node, *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_flow_free(flow); + } + } + +skip_flows: + free_buckets(table->buckets); + kfree(table); +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct flow_table *table = container_of(rcu, struct flow_table, rcu); + + ovs_flow_tbl_destroy(table); +} + +void ovs_flow_tbl_deferred_destroy(struct flow_table *table) +{ + if (!table) + return; + + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); +} + +struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + struct hlist_node *n; + int ver; + int i; + + ver = table->node_ver; + while (*bucket < table->n_buckets) { + i = 0; + head = flex_array_get(table->buckets, *bucket); + hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + struct hlist_node *n; + + head = flex_array_get(old->buckets, i); + + hlist_for_each_entry(flow, n, head, hash_node[old_ver]) + ovs_flow_tbl_insert(new, flow); + } + old->keep_flows = true; +} + +static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets) +{ + struct flow_table *new_table; + + new_table = ovs_flow_tbl_alloc(n_buckets); + if (!new_table) + return ERR_PTR(-ENOMEM); + + flow_table_copy_flows(table, new_table); + + return new_table; +} + +struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) +{ + return __flow_tbl_rehash(table, table->n_buckets); +} + +struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) +{ + return __flow_tbl_rehash(table, table->n_buckets * 2); +} + +void ovs_flow_free(struct sw_flow *flow) +{ + if (unlikely(!flow)) + return; + + kfree((struct sf_flow_acts __force *)flow->sf_acts); + kmem_cache_free(flow_cache, flow); +} + +/* RCU callback used by ovs_flow_deferred_free. */ +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + ovs_flow_free(flow); +} + +/* Schedules 'flow' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_flow_deferred_free(struct sw_flow *flow) +{ + call_rcu(&flow->rcu, rcu_free_flow_callback); +} + +/* RCU callback used by ovs_flow_deferred_free_acts. */ +static void rcu_free_acts_callback(struct rcu_head *rcu) +{ + struct sw_flow_actions *sf_acts = container_of(rcu, + struct sw_flow_actions, rcu); + kfree(sf_acts); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts) +{ + call_rcu(&sf_acts->rcu, rcu_free_acts_callback); +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct qtag_prefix { + __be16 eth_type; /* ETH_P_8021Q */ + __be16 tci; + }; + struct qtag_prefix *qp; + + if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + + sizeof(__be16)))) + return -ENOMEM; + + qp = (struct qtag_prefix *) skb->data; + key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); + __skb_pull(skb, sizeof(struct qtag_prefix)); + + return 0; +} + +static __be16 parse_ethertype(struct sk_buff *skb) +{ + struct llc_snap_hdr { + u8 dsap; /* Always 0xAA */ + u8 ssap; /* Always 0xAA */ + u8 ctrl; + u8 oui[3]; + __be16 ethertype; + }; + struct llc_snap_hdr *llc; + __be16 proto; + + proto = *(__be16 *) skb->data; + __skb_pull(skb, sizeof(__be16)); + + if (ntohs(proto) >= 1536) + return proto; + + if (skb->len < sizeof(struct llc_snap_hdr)) + return htons(ETH_P_802_2); + + if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr)))) + return htons(0); + + llc = (struct llc_snap_hdr *) skb->data; + if (llc->dsap != LLC_SAP_SNAP || + llc->ssap != LLC_SAP_SNAP || + (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) + return htons(ETH_P_802_2); + + __skb_pull(skb, sizeof(struct llc_snap_hdr)); + return llc->ethertype; +} + +static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, + int *key_lenp, int nh_len) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + int error = 0; + int key_len; + + /* The ICMPv6 type and code fields use the 16-bit transport port + * fields, so we need to store them in 16-bit network byte order. + */ + key->ipv6.tp.src = htons(icmp->icmp6_type); + key->ipv6.tp.dst = htons(icmp->icmp6_code); + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + + if (icmp->icmp6_code == 0 && + (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) { + int icmp_len = skb->len - skb_transport_offset(skb); + struct nd_msg *nd; + int offset; + + key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + + /* In order to process neighbor discovery options, we need the + * entire packet. + */ + if (unlikely(icmp_len < sizeof(*nd))) + goto out; + if (unlikely(skb_linearize(skb))) { + error = -ENOMEM; + goto out; + } + + nd = (struct nd_msg *)skb_transport_header(skb); + key->ipv6.nd.target = nd->target; + key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + + icmp_len -= sizeof(*nd); + offset = 0; + while (icmp_len >= 8) { + struct nd_opt_hdr *nd_opt = + (struct nd_opt_hdr *)(nd->opt + offset); + int opt_len = nd_opt->nd_opt_len * 8; + + if (unlikely(!opt_len || opt_len > icmp_len)) + goto invalid; + + /* Store the link layer address if the appropriate + * option is provided. It is considered an error if + * the same link layer option is specified twice. + */ + if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) + goto invalid; + memcpy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) + goto invalid; + memcpy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + } + + icmp_len -= opt_len; + offset += opt_len; + } + } + + goto out; + +invalid: + memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); + memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); + memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); + +out: + *key_lenp = key_len; + return error; +} + +/** + * ovs_flow_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @in_port: port number on which @skb was received. + * @key: output flow key + * @key_lenp: length of output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Returns 0 if successful, otherwise a negative errno value. + * + * Initializes @skb header pointers as follows: + * + * - skb->mac_header: the Ethernet header. + * + * - skb->network_header: just past the Ethernet header, or just past the + * VLAN header, to the first byte of the Ethernet payload. + * + * - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6 + * on output, then just past the IP header, if one is present and + * of a correct length, otherwise the same as skb->network_header. + * For other key->dl_type values it is left untouched. + */ +int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, + int *key_lenp) +{ + int error = 0; + int key_len = SW_FLOW_KEY_OFFSET(eth); + struct ethhdr *eth; + + memset(key, 0, sizeof(*key)); + + key->phy.priority = skb->priority; + key->phy.in_port = in_port; + + skb_reset_mac_header(skb); + + /* Link layer. We are guaranteed to have at least the 14 byte Ethernet + * header in the linear data area. + */ + eth = eth_hdr(skb); + memcpy(key->eth.src, eth->h_source, ETH_ALEN); + memcpy(key->eth.dst, eth->h_dest, ETH_ALEN); + + __skb_pull(skb, 2 * ETH_ALEN); + + if (vlan_tx_tag_present(skb)) + key->eth.tci = htons(skb->vlan_tci); + else if (eth->h_proto == htons(ETH_P_8021Q)) + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; + + skb_reset_network_header(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + + /* Network layer. */ + if (key->eth.type == htons(ETH_P_IP)) { + struct iphdr *nh; + __be16 offset; + + key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); + + error = check_iphdr(skb); + if (unlikely(error)) { + if (error == -EINVAL) { + skb->transport_header = skb->network_header; + error = 0; + } + goto out; + } + + nh = ip_hdr(skb); + key->ipv4.addr.src = nh->saddr; + key->ipv4.addr.dst = nh->daddr; + + key->ip.proto = nh->protocol; + key->ip.tos = nh->tos; + key->ip.ttl = nh->ttl; + + offset = nh->frag_off & htons(IP_OFFSET); + if (offset) { + key->ip.frag = OVS_FRAG_TYPE_LATER; + goto out; + } + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + + /* Transport layer. */ + if (key->ip.proto == IPPROTO_TCP) { + key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->ipv4.tp.src = tcp->source; + key->ipv4.tp.dst = tcp->dest; + } + } else if (key->ip.proto == IPPROTO_UDP) { + key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->ipv4.tp.src = udp->source; + key->ipv4.tp.dst = udp->dest; + } + } else if (key->ip.proto == IPPROTO_ICMP) { + key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + if (icmphdr_ok(skb)) { + struct icmphdr *icmp = icmp_hdr(skb); + /* The ICMP type and code fields use the 16-bit + * transport port fields, so we need to store + * them in 16-bit network byte order. */ + key->ipv4.tp.src = htons(icmp->type); + key->ipv4.tp.dst = htons(icmp->code); + } + } + + } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) { + struct arp_eth_header *arp; + + arp = (struct arp_eth_header *)skb_network_header(skb); + + if (arp->ar_hrd == htons(ARPHRD_ETHER) + && arp->ar_pro == htons(ETH_P_IP) + && arp->ar_hln == ETH_ALEN + && arp->ar_pln == 4) { + + /* We only match on the lower 8 bits of the opcode. */ + if (ntohs(arp->ar_op) <= 0xff) + key->ip.proto = ntohs(arp->ar_op); + + if (key->ip.proto == ARPOP_REQUEST + || key->ip.proto == ARPOP_REPLY) { + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); + memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); + memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); + memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); + key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); + } + } + } else if (key->eth.type == htons(ETH_P_IPV6)) { + int nh_len; /* IPv6 Header + Extensions */ + + nh_len = parse_ipv6hdr(skb, key, &key_len); + if (unlikely(nh_len < 0)) { + if (nh_len == -EINVAL) + skb->transport_header = skb->network_header; + else + error = nh_len; + goto out; + } + + if (key->ip.frag == OVS_FRAG_TYPE_LATER) + goto out; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + + /* Transport layer. */ + if (key->ip.proto == NEXTHDR_TCP) { + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->ipv6.tp.src = tcp->source; + key->ipv6.tp.dst = tcp->dest; + } + } else if (key->ip.proto == NEXTHDR_UDP) { + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->ipv6.tp.src = udp->source; + key->ipv6.tp.dst = udp->dest; + } + } else if (key->ip.proto == NEXTHDR_ICMP) { + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + if (icmp6hdr_ok(skb)) { + error = parse_icmpv6(skb, key, &key_len, nh_len); + if (error < 0) + goto out; + } + } + } + +out: + *key_lenp = key_len; + return error; +} + +u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len) +{ + return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0); +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, + struct sw_flow_key *key, int key_len) +{ + struct sw_flow *flow; + struct hlist_node *n; + struct hlist_head *head; + u32 hash; + + hash = ovs_flow_hash(key, key_len); + + head = find_bucket(table, hash); + hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) { + + if (flow->hash == hash && + !memcmp(&flow->key, key, key_len)) { + return flow; + } + } + return NULL; +} + +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; +} + +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + hlist_del_rcu(&flow->hash_node[table->node_ver]); + table->count--; + BUG_ON(table->count < 0); +} + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ +const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = -1, + [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), + [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), + [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), + [OVS_KEY_ATTR_VLAN] = sizeof(__be16), + [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), + [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), + [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), + [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), + [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), + [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), + [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), + [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), +}; + +static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, + const struct nlattr *a[], u32 *attrs) +{ + const struct ovs_key_icmp *icmp_key; + const struct ovs_key_tcp *tcp_key; + const struct ovs_key_udp *udp_key; + + switch (swkey->ip.proto) { + case IPPROTO_TCP: + if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TCP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + swkey->ipv4.tp.src = tcp_key->tcp_src; + swkey->ipv4.tp.dst = tcp_key->tcp_dst; + break; + + case IPPROTO_UDP: + if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_UDP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + swkey->ipv4.tp.src = udp_key->udp_src; + swkey->ipv4.tp.dst = udp_key->udp_dst; + break; + + case IPPROTO_ICMP: + if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + swkey->ipv4.tp.src = htons(icmp_key->icmp_type); + swkey->ipv4.tp.dst = htons(icmp_key->icmp_code); + break; + } + + return 0; +} + +static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, + const struct nlattr *a[], u32 *attrs) +{ + const struct ovs_key_icmpv6 *icmpv6_key; + const struct ovs_key_tcp *tcp_key; + const struct ovs_key_udp *udp_key; + + switch (swkey->ip.proto) { + case IPPROTO_TCP: + if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TCP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + swkey->ipv6.tp.src = tcp_key->tcp_src; + swkey->ipv6.tp.dst = tcp_key->tcp_dst; + break; + + case IPPROTO_UDP: + if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_UDP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + swkey->ipv6.tp.src = udp_key->udp_src; + swkey->ipv6.tp.dst = udp_key->udp_dst; + break; + + case IPPROTO_ICMPV6: + if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type); + swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code); + + if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + const struct ovs_key_nd *nd_key; + + if (!(*attrs & (1 << OVS_KEY_ATTR_ND))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_ND); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + memcpy(&swkey->ipv6.nd.target, nd_key->nd_target, + sizeof(swkey->ipv6.nd.target)); + memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN); + memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN); + } + break; + } + + return 0; +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u32 *attrsp) +{ + const struct nlattr *nla; + u32 attrs; + int rem; + + attrs = 0; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type)) + return -EINVAL; + + expected_len = ovs_key_lens[type]; + if (nla_len(nla) != expected_len && expected_len != -1) + return -EINVAL; + + attrs |= 1 << type; + a[type] = nla; + } + if (rem) + return -EINVAL; + + *attrsp = attrs; + return 0; +} + +/** + * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. + * @swkey: receives the extracted flow key. + * @key_lenp: number of bytes used in @swkey. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + */ +int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, + const struct nlattr *attr) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct ovs_key_ethernet *eth_key; + int key_len; + u32 attrs; + int err; + + memset(swkey, 0, sizeof(struct sw_flow_key)); + key_len = SW_FLOW_KEY_OFFSET(eth); + + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) + return err; + + /* Metadata attributes. */ + if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]); + attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } + if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + if (in_port >= DP_MAX_PORTS) + return -EINVAL; + swkey->phy.in_port = in_port; + attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else { + swkey->phy.in_port = USHRT_MAX; + } + + /* Data attributes. */ + if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN); + memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN); + + if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) && + nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) { + const struct nlattr *encap; + __be16 tci; + + if (attrs != ((1 << OVS_KEY_ATTR_VLAN) | + (1 << OVS_KEY_ATTR_ETHERTYPE) | + (1 << OVS_KEY_ATTR_ENCAP))) + return -EINVAL; + + encap = a[OVS_KEY_ATTR_ENCAP]; + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + if (tci & htons(VLAN_TAG_PRESENT)) { + swkey->eth.tci = tci; + + err = parse_flow_nlattrs(encap, a, &attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) + return -EINVAL; + + swkey->eth.type = htons(ETH_P_8021Q); + *key_lenp = key_len; + return 0; + } else { + return -EINVAL; + } + } + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (ntohs(swkey->eth.type) < 1536) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + } else { + swkey->eth.type = htons(ETH_P_802_2); + } + + if (swkey->eth.type == htons(ETH_P_IP)) { + const struct ovs_key_ipv4 *ipv4_key; + + if (!(attrs & (1 << OVS_KEY_ATTR_IPV4))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + + key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) + return -EINVAL; + swkey->ip.proto = ipv4_key->ipv4_proto; + swkey->ip.tos = ipv4_key->ipv4_tos; + swkey->ip.ttl = ipv4_key->ipv4_ttl; + swkey->ip.frag = ipv4_key->ipv4_frag; + swkey->ipv4.addr.src = ipv4_key->ipv4_src; + swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; + + if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs); + if (err) + return err; + } + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + if (!(attrs & (1 << OVS_KEY_ATTR_IPV6))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + + key_len = SW_FLOW_KEY_OFFSET(ipv6.label); + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) + return -EINVAL; + swkey->ipv6.label = ipv6_key->ipv6_label; + swkey->ip.proto = ipv6_key->ipv6_proto; + swkey->ip.tos = ipv6_key->ipv6_tclass; + swkey->ip.ttl = ipv6_key->ipv6_hlimit; + swkey->ip.frag = ipv6_key->ipv6_frag; + memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, + sizeof(swkey->ipv6.addr.src)); + memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, + sizeof(swkey->ipv6.addr.dst)); + + if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs); + if (err) + return err; + } + } else if (swkey->eth.type == htons(ETH_P_ARP)) { + const struct ovs_key_arp *arp_key; + + if (!(attrs & (1 << OVS_KEY_ATTR_ARP))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + + key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + swkey->ipv4.addr.src = arp_key->arp_sip; + swkey->ipv4.addr.dst = arp_key->arp_tip; + if (arp_key->arp_op & htons(0xff00)) + return -EINVAL; + swkey->ip.proto = ntohs(arp_key->arp_op); + memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); + memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + } + + if (attrs) + return -EINVAL; + *key_lenp = key_len; + + return 0; +} + +/** + * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. + * @in_port: receives the extracted input port. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + */ +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, + const struct nlattr *attr) +{ + const struct nlattr *nla; + int rem; + + *in_port = USHRT_MAX; + *priority = 0; + + nla_for_each_nested(nla, attr, rem) { + int type = nla_type(nla); + + if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { + if (nla_len(nla) != ovs_key_lens[type]) + return -EINVAL; + + switch (type) { + case OVS_KEY_ATTR_PRIORITY: + *priority = nla_get_u32(nla); + break; + + case OVS_KEY_ATTR_IN_PORT: + if (nla_get_u32(nla) >= DP_MAX_PORTS) + return -EINVAL; + *in_port = nla_get_u32(nla); + break; + } + } + } + if (rem) + return -EINVAL; + return 0; +} + +int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla, *encap; + + if (swkey->phy.priority) + NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority); + + if (swkey->phy.in_port != USHRT_MAX) + NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port); + + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + eth_key = nla_data(nla); + memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN); + memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN); + + if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { + NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)); + NLA_PUT_BE16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci); + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.tci) + goto unencap; + } else { + encap = NULL; + } + + if (swkey->eth.type == htons(ETH_P_802_2)) + goto unencap; + + NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type); + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = swkey->ipv4.addr.src; + ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; + ipv4_key->ipv4_proto = swkey->ip.proto; + ipv4_key->ipv4_tos = swkey->ip.tos; + ipv4_key->ipv4_ttl = swkey->ip.ttl; + ipv4_key->ipv4_frag = swkey->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = swkey->ipv6.label; + ipv6_key->ipv6_proto = swkey->ip.proto; + ipv6_key->ipv6_tclass = swkey->ip.tos; + ipv6_key->ipv6_hlimit = swkey->ip.ttl; + ipv6_key->ipv6_frag = swkey->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_ARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = swkey->ipv4.addr.src; + arp_key->arp_tip = swkey->ipv4.addr.dst; + arp_key->arp_op = htons(swkey->ip.proto); + memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); + memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + + if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + tcp_key->tcp_src = swkey->ipv4.tp.src; + tcp_key->tcp_dst = swkey->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + tcp_key->tcp_src = swkey->ipv6.tp.src; + tcp_key->tcp_dst = swkey->ipv6.tp.dst; + } + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + udp_key->udp_src = swkey->ipv4.tp.src; + udp_key->udp_dst = swkey->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + udp_key->udp_src = swkey->ipv6.tp.src; + udp_key->udp_dst = swkey->ipv6.tp.dst; + } + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src); + icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src); + icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst); + + if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &swkey->ipv6.nd.target, + sizeof(nd_key->nd_target)); + memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN); + memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN); + } + } + } + +unencap: + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, + 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h new file mode 100644 index 000000000000..2747dc2c4ac1 --- /dev/null +++ b/net/openvswitch/flow.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_H +#define FLOW_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> +#include <net/inet_ecn.h> + +struct sk_buff; + +struct sw_flow_actions { + struct rcu_head rcu; + u32 actions_len; + struct nlattr actions[]; +}; + +struct sw_flow_key { + struct { + u32 priority; /* Packet QoS priority. */ + u16 in_port; /* Input switch port (or USHRT_MAX). */ + } phy; + struct { + u8 src[ETH_ALEN]; /* Ethernet source address. */ + u8 dst[ETH_ALEN]; /* Ethernet destination address. */ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + __be16 type; /* Ethernet frame type. */ + } eth; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + union { + struct { + struct { + __be32 src; /* IP source address. */ + __be32 dst; /* IP destination address. */ + } addr; + union { + struct { + __be16 src; /* TCP/UDP source port. */ + __be16 dst; /* TCP/UDP destination port. */ + } tp; + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + }; + } ipv4; + struct { + struct { + struct in6_addr src; /* IPv6 source address. */ + struct in6_addr dst; /* IPv6 destination address. */ + } addr; + __be32 label; /* IPv6 flow label. */ + struct { + __be16 src; /* TCP/UDP source port. */ + __be16 dst; /* TCP/UDP destination port. */ + } tp; + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + } ipv6; + }; +}; + +struct sw_flow { + struct rcu_head rcu; + struct hlist_node hash_node[2]; + u32 hash; + + struct sw_flow_key key; + struct sw_flow_actions __rcu *sf_acts; + + spinlock_t lock; /* Lock for values below. */ + unsigned long used; /* Last used time (in jiffies). */ + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + u8 tcp_flags; /* Union of seen TCP flags. */ +}; + +struct arp_eth_header { + __be16 ar_hrd; /* format of hardware address */ + __be16 ar_pro; /* format of protocol address */ + unsigned char ar_hln; /* length of hardware address */ + unsigned char ar_pln; /* length of protocol address */ + __be16 ar_op; /* ARP opcode (command) */ + + /* Ethernet+IPv4 specific members. */ + unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ + unsigned char ar_sip[4]; /* sender IP address */ + unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ + unsigned char ar_tip[4]; /* target IP address */ +} __packed; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_deferred_free(struct sw_flow *); +void ovs_flow_free(struct sw_flow *flow); + +struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *); +void ovs_flow_deferred_free_acts(struct sw_flow_actions *); + +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, + int *key_lenp); +void ovs_flow_used(struct sw_flow *, struct sk_buff *); +u64 ovs_flow_used_time(unsigned long flow_jiffies); + +/* Upper bound on the length of a nlattr-formatted flow key. The longest + * nlattr-formatted flow key would be: + * + * struct pad nl hdr total + * ------ --- ------ ----- + * OVS_KEY_ATTR_PRIORITY 4 -- 4 8 + * OVS_KEY_ATTR_IN_PORT 4 -- 4 8 + * OVS_KEY_ATTR_ETHERNET 12 -- 4 16 + * OVS_KEY_ATTR_8021Q 4 -- 4 8 + * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 + * OVS_KEY_ATTR_IPV6 40 -- 4 44 + * OVS_KEY_ATTR_ICMPV6 2 2 4 8 + * OVS_KEY_ATTR_ND 28 -- 4 32 + * ------------------------------------------------- + * total 132 + */ +#define FLOW_BUFSIZE 132 + +int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); +int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, + const struct nlattr *); +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, + const struct nlattr *); + +#define TBL_MIN_BUCKETS 1024 + +struct flow_table { + struct flex_array *buckets; + unsigned int count, n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + bool keep_flows; +}; + +static inline int ovs_flow_tbl_count(struct flow_table *table) +{ + return table->count; +} + +static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) +{ + return (table->count > table->n_buckets); +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, + struct sw_flow_key *key, int len); +void ovs_flow_tbl_destroy(struct flow_table *table); +void ovs_flow_tbl_deferred_destroy(struct flow_table *table); +struct flow_table *ovs_flow_tbl_alloc(int new_size); +struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); +struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len); + +struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); +extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; + +#endif /* flow.h */ diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c new file mode 100644 index 000000000000..8fc28b86f2b3 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/hardirq.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/skbuff.h> +#include <linux/version.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +struct internal_dev { + struct vport *vport; +}; + +static struct internal_dev *internal_dev_priv(struct net_device *netdev) +{ + return netdev_priv(netdev); +} + +/* This function is only called by the kernel network layer.*/ +static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct vport *vport = ovs_internal_dev_get_vport(netdev); + struct ovs_vport_stats vport_stats; + + ovs_vport_get_stats(vport, &vport_stats); + + /* The tx and rx stats need to be swapped because the + * switch and host OS have opposite perspectives. */ + stats->rx_packets = vport_stats.tx_packets; + stats->tx_packets = vport_stats.rx_packets; + stats->rx_bytes = vport_stats.tx_bytes; + stats->tx_bytes = vport_stats.rx_bytes; + stats->rx_errors = vport_stats.tx_errors; + stats->tx_errors = vport_stats.rx_errors; + stats->rx_dropped = vport_stats.tx_dropped; + stats->tx_dropped = vport_stats.rx_dropped; + + return stats; +} + +static int internal_dev_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + return 0; +} + +/* Called with rcu_read_lock_bh. */ +static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + rcu_read_lock(); + ovs_vport_receive(internal_dev_priv(netdev)->vport, skb); + rcu_read_unlock(); + return 0; +} + +static int internal_dev_open(struct net_device *netdev) +{ + netif_start_queue(netdev); + return 0; +} + +static int internal_dev_stop(struct net_device *netdev) +{ + netif_stop_queue(netdev); + return 0; +} + +static void internal_dev_getinfo(struct net_device *netdev, + struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "openvswitch"); +} + +static const struct ethtool_ops internal_dev_ethtool_ops = { + .get_drvinfo = internal_dev_getinfo, + .get_link = ethtool_op_get_link, +}; + +static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < 68) + return -EINVAL; + + netdev->mtu = new_mtu; + return 0; +} + +static void internal_dev_destructor(struct net_device *dev) +{ + struct vport *vport = ovs_internal_dev_get_vport(dev); + + ovs_vport_free(vport); + free_netdev(dev); +} + +static const struct net_device_ops internal_dev_netdev_ops = { + .ndo_open = internal_dev_open, + .ndo_stop = internal_dev_stop, + .ndo_start_xmit = internal_dev_xmit, + .ndo_set_mac_address = internal_dev_mac_addr, + .ndo_change_mtu = internal_dev_change_mtu, + .ndo_get_stats64 = internal_dev_get_stats, +}; + +static void do_setup(struct net_device *netdev) +{ + ether_setup(netdev); + + netdev->netdev_ops = &internal_dev_netdev_ops; + + netdev->priv_flags &= ~IFF_TX_SKB_SHARING; + netdev->destructor = internal_dev_destructor; + SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); + netdev->tx_queue_len = 0; + + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO; + + netdev->vlan_features = netdev->features; + netdev->features |= NETIF_F_HW_VLAN_TX; + netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + random_ether_addr(netdev->dev_addr); +} + +static struct vport *internal_dev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct netdev_vport *netdev_vport; + struct internal_dev *internal_dev; + int err; + + vport = ovs_vport_alloc(sizeof(struct netdev_vport), + &ovs_internal_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, do_setup); + if (!netdev_vport->dev) { + err = -ENOMEM; + goto error_free_vport; + } + + internal_dev = internal_dev_priv(netdev_vport->dev); + internal_dev->vport = vport; + + err = register_netdevice(netdev_vport->dev); + if (err) + goto error_free_netdev; + + dev_set_promiscuity(netdev_vport->dev, 1); + netif_start_queue(netdev_vport->dev); + + return vport; + +error_free_netdev: + free_netdev(netdev_vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void internal_dev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + netif_stop_queue(netdev_vport->dev); + dev_set_promiscuity(netdev_vport->dev, -1); + + /* unregister_netdevice() waits for an RCU grace period. */ + unregister_netdevice(netdev_vport->dev); +} + +static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +{ + struct net_device *netdev = netdev_vport_priv(vport)->dev; + int len; + + len = skb->len; + skb->dev = netdev; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, netdev); + + netif_rx(skb); + + return len; +} + +const struct vport_ops ovs_internal_vport_ops = { + .type = OVS_VPORT_TYPE_INTERNAL, + .create = internal_dev_create, + .destroy = internal_dev_destroy, + .get_name = ovs_netdev_get_name, + .get_ifindex = ovs_netdev_get_ifindex, + .send = internal_dev_recv, +}; + +int ovs_is_internal_dev(const struct net_device *netdev) +{ + return netdev->netdev_ops == &internal_dev_netdev_ops; +} + +struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) +{ + if (!ovs_is_internal_dev(netdev)) + return NULL; + + return internal_dev_priv(netdev)->vport; +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h new file mode 100644 index 000000000000..3454447c5f11 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_INTERNAL_DEV_H +#define VPORT_INTERNAL_DEV_H 1 + +#include "datapath.h" +#include "vport.h" + +int ovs_is_internal_dev(const struct net_device *); +struct vport *ovs_internal_dev_get_vport(struct net_device *); + +#endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c new file mode 100644 index 000000000000..c1068aed03d1 --- /dev/null +++ b/net/openvswitch/vport-netdev.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if_arp.h> +#include <linux/if_bridge.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/llc.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <net/llc.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +/* Must be called with rcu_read_lock. */ +static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +{ + if (unlikely(!vport)) { + kfree_skb(skb); + return; + } + + /* Make our own copy of the packet. Otherwise we will mangle the + * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). + * (No one comes after us, since we tell handle_bridge() that we took + * the packet.) */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return; + + skb_push(skb, ETH_HLEN); + ovs_vport_receive(vport, skb); +} + +/* Called with rcu_read_lock and bottom-halves disabled. */ +static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct vport *vport; + + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + + vport = ovs_netdev_get_vport(skb->dev); + + netdev_port_receive(vport, skb); + + return RX_HANDLER_CONSUMED; +} + +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct netdev_vport *netdev_vport; + int err; + + vport = ovs_vport_alloc(sizeof(struct netdev_vport), + &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev = dev_get_by_name(&init_net, parms->name); + if (!netdev_vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + + if (netdev_vport->dev->flags & IFF_LOOPBACK || + netdev_vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(netdev_vport->dev)) { + err = -EINVAL; + goto error_put; + } + + err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + vport); + if (err) + goto error_put; + + dev_set_promiscuity(netdev_vport->dev, 1); + netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + + return vport; + +error_put: + dev_put(netdev_vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void netdev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(netdev_vport->dev); + dev_set_promiscuity(netdev_vport->dev, -1); + + synchronize_rcu(); + + dev_put(netdev_vport->dev); + ovs_vport_free(vport); +} + +const char *ovs_netdev_get_name(const struct vport *vport) +{ + const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + return netdev_vport->dev->name; +} + +int ovs_netdev_get_ifindex(const struct vport *vport) +{ + const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + return netdev_vport->dev->ifindex; +} + +static unsigned packet_length(const struct sk_buff *skb) +{ + unsigned length = skb->len - ETH_HLEN; + + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; + + return length; +} + +static int netdev_send(struct vport *vport, struct sk_buff *skb) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + int mtu = netdev_vport->dev->mtu; + int len; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + if (net_ratelimit()) + pr_warn("%s: dropped over-mtu packet: %d > %d\n", + ovs_dp_name(vport->dp), packet_length(skb), mtu); + goto error; + } + + if (unlikely(skb_warn_if_lro(skb))) + goto error; + + skb->dev = netdev_vport->dev; + len = skb->len; + dev_queue_xmit(skb); + + return len; + +error: + kfree_skb(skb); + ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); + return 0; +} + +/* Returns null if this device is not attached to a datapath. */ +struct vport *ovs_netdev_get_vport(struct net_device *dev) +{ + if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + return (struct vport *) + rcu_dereference_rtnl(dev->rx_handler_data); + else + return NULL; +} + +const struct vport_ops ovs_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_NETDEV, + .create = netdev_create, + .destroy = netdev_destroy, + .get_name = ovs_netdev_get_name, + .get_ifindex = ovs_netdev_get_ifindex, + .send = netdev_send, +}; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h new file mode 100644 index 000000000000..fd9b008a0e6e --- /dev/null +++ b/net/openvswitch/vport-netdev.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_NETDEV_H +#define VPORT_NETDEV_H 1 + +#include <linux/netdevice.h> + +#include "vport.h" + +struct vport *ovs_netdev_get_vport(struct net_device *dev); + +struct netdev_vport { + struct net_device *dev; +}; + +static inline struct netdev_vport * +netdev_vport_priv(const struct vport *vport) +{ + return vport_priv(vport); +} + +const char *ovs_netdev_get_name(const struct vport *); +const char *ovs_netdev_get_config(const struct vport *); +int ovs_netdev_get_ifindex(const struct vport *); + +#endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c new file mode 100644 index 000000000000..6cd760131f15 --- /dev/null +++ b/net/openvswitch/vport.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/dcache.h> +#include <linux/etherdevice.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> +#include <linux/compat.h> +#include <linux/version.h> + +#include "vport.h" +#include "vport-internal_dev.h" + +/* List of statically compiled vport implementations. Don't forget to also + * add yours to the list at the bottom of vport.h. */ +static const struct vport_ops *vport_ops_list[] = { + &ovs_netdev_vport_ops, + &ovs_internal_vport_ops, +}; + +/* Protected by RCU read lock for reading, RTNL lock for writing. */ +static struct hlist_head *dev_table; +#define VPORT_HASH_BUCKETS 1024 + +/** + * ovs_vport_init - initialize vport subsystem + * + * Called at module load time to initialize the vport subsystem. + */ +int ovs_vport_init(void) +{ + dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dev_table) + return -ENOMEM; + + return 0; +} + +/** + * ovs_vport_exit - shutdown vport subsystem + * + * Called at module exit time to shutdown the vport subsystem. + */ +void ovs_vport_exit(void) +{ + kfree(dev_table); +} + +static struct hlist_head *hash_bucket(const char *name) +{ + unsigned int hash = full_name_hash(name, strlen(name)); + return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; +} + +/** + * ovs_vport_locate - find a port that has already been created + * + * @name: name of port to find + * + * Must be called with RTNL or RCU read lock. + */ +struct vport *ovs_vport_locate(const char *name) +{ + struct hlist_head *bucket = hash_bucket(name); + struct vport *vport; + struct hlist_node *node; + + hlist_for_each_entry_rcu(vport, node, bucket, hash_node) + if (!strcmp(name, vport->ops->get_name(vport))) + return vport; + + return NULL; +} + +/** + * ovs_vport_alloc - allocate and initialize new vport + * + * @priv_size: Size of private data area to allocate. + * @ops: vport device ops + * + * Allocate and initialize a new vport defined by @ops. The vport will contain + * a private data area of size @priv_size that can be accessed using + * vport_priv(). vports that are no longer needed should be released with + * vport_free(). + */ +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, + const struct vport_parms *parms) +{ + struct vport *vport; + size_t alloc_size; + + alloc_size = sizeof(struct vport); + if (priv_size) { + alloc_size = ALIGN(alloc_size, VPORT_ALIGN); + alloc_size += priv_size; + } + + vport = kzalloc(alloc_size, GFP_KERNEL); + if (!vport) + return ERR_PTR(-ENOMEM); + + vport->dp = parms->dp; + vport->port_no = parms->port_no; + vport->upcall_pid = parms->upcall_pid; + vport->ops = ops; + + vport->percpu_stats = alloc_percpu(struct vport_percpu_stats); + if (!vport->percpu_stats) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&vport->stats_lock); + + return vport; +} + +/** + * ovs_vport_free - uninitialize and free vport + * + * @vport: vport to free + * + * Frees a vport allocated with vport_alloc() when it is no longer needed. + * + * The caller must ensure that an RCU grace period has passed since the last + * time @vport was in a datapath. + */ +void ovs_vport_free(struct vport *vport) +{ + free_percpu(vport->percpu_stats); + kfree(vport); +} + +/** + * ovs_vport_add - add vport device (for kernel callers) + * + * @parms: Information about new vport. + * + * Creates a new vport with the specified configuration (which is dependent on + * device type). RTNL lock must be held. + */ +struct vport *ovs_vport_add(const struct vport_parms *parms) +{ + struct vport *vport; + int err = 0; + int i; + + ASSERT_RTNL(); + + for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { + if (vport_ops_list[i]->type == parms->type) { + vport = vport_ops_list[i]->create(parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto out; + } + + hlist_add_head_rcu(&vport->hash_node, + hash_bucket(vport->ops->get_name(vport))); + return vport; + } + } + + err = -EAFNOSUPPORT; + +out: + return ERR_PTR(err); +} + +/** + * ovs_vport_set_options - modify existing vport device (for kernel callers) + * + * @vport: vport to modify. + * @port: New configuration. + * + * Modifies an existing device with the specified configuration (which is + * dependent on device type). RTNL lock must be held. + */ +int ovs_vport_set_options(struct vport *vport, struct nlattr *options) +{ + ASSERT_RTNL(); + + if (!vport->ops->set_options) + return -EOPNOTSUPP; + return vport->ops->set_options(vport, options); +} + +/** + * ovs_vport_del - delete existing vport device + * + * @vport: vport to delete. + * + * Detaches @vport from its datapath and destroys it. It is possible to fail + * for reasons such as lack of memory. RTNL lock must be held. + */ +void ovs_vport_del(struct vport *vport) +{ + ASSERT_RTNL(); + + hlist_del_rcu(&vport->hash_node); + + vport->ops->destroy(vport); +} + +/** + * ovs_vport_get_stats - retrieve device stats + * + * @vport: vport from which to retrieve the stats + * @stats: location to store stats + * + * Retrieves transmit, receive, and error stats for the given device. + * + * Must be called with RTNL lock or rcu_read_lock. + */ +void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + + /* We potentially have 2 sources of stats that need to be combined: + * those we have collected (split into err_stats and percpu_stats) from + * set_stats() and device error stats from netdev->get_stats() (for + * errors that happen downstream and therefore aren't reported through + * our vport_record_error() function). + * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). + * netdev-stats can be directly read over netlink-ioctl. + */ + + spin_lock_bh(&vport->stats_lock); + + stats->rx_errors = vport->err_stats.rx_errors; + stats->tx_errors = vport->err_stats.tx_errors; + stats->tx_dropped = vport->err_stats.tx_dropped; + stats->rx_dropped = vport->err_stats.rx_dropped; + + spin_unlock_bh(&vport->stats_lock); + + for_each_possible_cpu(i) { + const struct vport_percpu_stats *percpu_stats; + struct vport_percpu_stats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(vport->percpu_stats, i); + + do { + start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } +} + +/** + * ovs_vport_get_options - retrieve device options + * + * @vport: vport from which to retrieve the options. + * @skb: sk_buff where options should be appended. + * + * Retrieves the configuration of the given device, appending an + * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested + * vport-specific attributes to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another + * negative error code if a real error occurred. If an error occurs, @skb is + * left unmodified. + * + * Must be called with RTNL lock or rcu_read_lock. + */ +int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); + if (!nla) + return -EMSGSIZE; + + if (vport->ops->get_options) { + int err = vport->ops->get_options(vport, skb); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + } + + nla_nest_end(skb, nla); + return 0; +} + +/** + * ovs_vport_receive - pass up received packet to the datapath for processing + * + * @vport: vport that received the packet + * @skb: skb that was received + * + * Must be called with rcu_read_lock. The packet cannot be shared and + * skb->data should point to the Ethernet header. The caller must have already + * called compute_ip_summed() to initialize the checksumming fields. + */ +void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) +{ + struct vport_percpu_stats *stats; + + stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id()); + + u64_stats_update_begin(&stats->sync); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->sync); + + ovs_dp_process_received_packet(vport, skb); +} + +/** + * ovs_vport_send - send a packet on a device + * + * @vport: vport on which to send the packet + * @skb: skb to send + * + * Sends the given packet and returns the length of data sent. Either RTNL + * lock or rcu_read_lock must be held. + */ +int ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + int sent = vport->ops->send(vport, skb); + + if (likely(sent)) { + struct vport_percpu_stats *stats; + + stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id()); + + u64_stats_update_begin(&stats->sync); + stats->tx_packets++; + stats->tx_bytes += sent; + u64_stats_update_end(&stats->sync); + } + return sent; +} + +/** + * ovs_vport_record_error - indicate device error to generic stats layer + * + * @vport: vport that encountered the error + * @err_type: one of enum vport_err_type types to indicate the error type + * + * If using the vport generic stats layer indicate that an error of the given + * type has occured. + */ +void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) +{ + spin_lock(&vport->stats_lock); + + switch (err_type) { + case VPORT_E_RX_DROPPED: + vport->err_stats.rx_dropped++; + break; + + case VPORT_E_RX_ERROR: + vport->err_stats.rx_errors++; + break; + + case VPORT_E_TX_DROPPED: + vport->err_stats.tx_dropped++; + break; + + case VPORT_E_TX_ERROR: + vport->err_stats.tx_errors++; + break; + }; + + spin_unlock(&vport->stats_lock); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h new file mode 100644 index 000000000000..19609629dabd --- /dev/null +++ b/net/openvswitch/vport.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_H +#define VPORT_H 1 + +#include <linux/list.h> +#include <linux/openvswitch.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/u64_stats_sync.h> + +#include "datapath.h" + +struct vport; +struct vport_parms; + +/* The following definitions are for users of the vport subsytem: */ + +int ovs_vport_init(void); +void ovs_vport_exit(void); + +struct vport *ovs_vport_add(const struct vport_parms *); +void ovs_vport_del(struct vport *); + +struct vport *ovs_vport_locate(const char *name); + +void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); + +int ovs_vport_set_options(struct vport *, struct nlattr *options); +int ovs_vport_get_options(const struct vport *, struct sk_buff *); + +int ovs_vport_send(struct vport *, struct sk_buff *); + +/* The following definitions are for implementers of vport devices: */ + +struct vport_percpu_stats { + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; + struct u64_stats_sync sync; +}; + +struct vport_err_stats { + u64 rx_dropped; + u64 rx_errors; + u64 tx_dropped; + u64 tx_errors; +}; + +/** + * struct vport - one port within a datapath + * @rcu: RCU callback head for deferred destruction. + * @port_no: Index into @dp's @ports array. + * @dp: Datapath to which this port belongs. + * @node: Element in @dp's @port_list. + * @upcall_pid: The Netlink port to use for packets received on this port that + * miss the flow table. + * @hash_node: Element in @dev_table hash table in vport.c. + * @ops: Class structure. + * @percpu_stats: Points to per-CPU statistics used and maintained by vport + * @stats_lock: Protects @err_stats; + * @err_stats: Points to error statistics used and maintained by vport + */ +struct vport { + struct rcu_head rcu; + u16 port_no; + struct datapath *dp; + struct list_head node; + u32 upcall_pid; + + struct hlist_node hash_node; + const struct vport_ops *ops; + + struct vport_percpu_stats __percpu *percpu_stats; + + spinlock_t stats_lock; + struct vport_err_stats err_stats; +}; + +/** + * struct vport_parms - parameters for creating a new vport + * + * @name: New vport's name. + * @type: New vport's type. + * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if + * none was supplied. + * @dp: New vport's datapath. + * @port_no: New vport's port number. + */ +struct vport_parms { + const char *name; + enum ovs_vport_type type; + struct nlattr *options; + + /* For ovs_vport_alloc(). */ + struct datapath *dp; + u16 port_no; + u32 upcall_pid; +}; + +/** + * struct vport_ops - definition of a type of virtual port + * + * @type: %OVS_VPORT_TYPE_* value for this type of virtual port. + * @create: Create a new vport configured as specified. On success returns + * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value. + * @destroy: Destroys a vport. Must call vport_free() on the vport but not + * before an RCU grace period has elapsed. + * @set_options: Modify the configuration of an existing vport. May be %NULL + * if modification is not supported. + * @get_options: Appends vport-specific attributes for the configuration of an + * existing vport to a &struct sk_buff. May be %NULL for a vport that does not + * have any configuration. + * @get_name: Get the device's name. + * @get_config: Get the device's configuration. + * @get_ifindex: Get the system interface index associated with the device. + * May be null if the device does not have an ifindex. + * @send: Send a packet on the device. Returns the length of the packet sent. + */ +struct vport_ops { + enum ovs_vport_type type; + + /* Called with RTNL lock. */ + struct vport *(*create)(const struct vport_parms *); + void (*destroy)(struct vport *); + + int (*set_options)(struct vport *, struct nlattr *); + int (*get_options)(const struct vport *, struct sk_buff *); + + /* Called with rcu_read_lock or RTNL lock. */ + const char *(*get_name)(const struct vport *); + void (*get_config)(const struct vport *, void *); + int (*get_ifindex)(const struct vport *); + + int (*send)(struct vport *, struct sk_buff *); +}; + +enum vport_err_type { + VPORT_E_RX_DROPPED, + VPORT_E_RX_ERROR, + VPORT_E_TX_DROPPED, + VPORT_E_TX_ERROR, +}; + +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, + const struct vport_parms *); +void ovs_vport_free(struct vport *); + +#define VPORT_ALIGN 8 + +/** + * vport_priv - access private data area of vport + * + * @vport: vport to access + * + * If a nonzero size was passed in priv_size of vport_alloc() a private data + * area was allocated on creation. This allows that area to be accessed and + * used for any purpose needed by the vport implementer. + */ +static inline void *vport_priv(const struct vport *vport) +{ + return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); +} + +/** + * vport_from_priv - lookup vport from private data pointer + * + * @priv: Start of private data area. + * + * It is sometimes useful to translate from a pointer to the private data + * area to the vport, such as in the case where the private data pointer is + * the result of a hash table lookup. @priv must point to the start of the + * private data area. + */ +static inline struct vport *vport_from_priv(const void *priv) +{ + return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); +} + +void ovs_vport_receive(struct vport *, struct sk_buff *); +void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); + +/* List of statically compiled vport implementations. Don't forget to also + * add yours to the list at the top of vport.c. */ +extern const struct vport_ops ovs_netdev_vport_ops; +extern const struct vport_ops ovs_internal_vport_ops; + +#endif /* vport.h */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 82a6f34d39d0..0da505c9ac23 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1499,10 +1499,11 @@ retry: if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); - skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL); + skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard @@ -1944,7 +1945,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, int size_max, - __be16 proto, unsigned char *addr) + __be16 proto, unsigned char *addr, int hlen) { union { struct tpacket_hdr *h1; @@ -1978,7 +1979,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, return -EMSGSIZE; } - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, hlen); skb_reset_network_header(skb); data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); @@ -2053,6 +2054,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) unsigned char *addr; int len_sum = 0; int status = 0; + int hlen, tlen; mutex_lock(&po->pg_vec_lock); @@ -2101,16 +2103,17 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } status = TP_STATUS_SEND_REQUEST; + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(&po->sk, - LL_ALLOCATED_SPACE(dev) - + sizeof(struct sockaddr_ll), + hlen + tlen + sizeof(struct sockaddr_ll), 0, &err); if (unlikely(skb == NULL)) goto out_status; tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, - addr); + addr, hlen); if (unlikely(tp_len < 0)) { if (po->tp_loss) { @@ -2207,6 +2210,7 @@ static int packet_snd(struct socket *sock, int vnet_hdr_len; struct packet_sock *po = pkt_sk(sk); unsigned short gso_type = 0; + int hlen, tlen; /* * Get and verify the address. @@ -2291,8 +2295,9 @@ static int packet_snd(struct socket *sock, goto out_unlock; err = -ENOBUFS; - skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), - LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len, + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; + skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 2ba6e9fb4cbc..9f60008740e3 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -534,6 +534,29 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) return pipe_handler_send_created_ind(sk); } +static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct pnpipehdr *hdr = pnp_hdr(skb); + + if (hdr->error_code != PN_PIPE_NO_ERROR) + return -ECONNREFUSED; + + return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */, + NULL, 0, GFP_ATOMIC); + +} + +static void pipe_start_flow_control(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + + if (!pn_flow_safe(pn->tx_fc)) { + atomic_set(&pn->tx_credits, 1); + sk->sk_write_space(sk); + } + pipe_grant_credits(sk, GFP_ATOMIC); +} + /* Queue an skb to an actively connected sock. * Socket lock must be held. */ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) @@ -579,13 +602,25 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_state = TCP_CLOSE_WAIT; break; } + if (pn->init_enable == PN_PIPE_DISABLE) + sk->sk_state = TCP_SYN_RECV; + else { + sk->sk_state = TCP_ESTABLISHED; + pipe_start_flow_control(sk); + } + break; - sk->sk_state = TCP_ESTABLISHED; - if (!pn_flow_safe(pn->tx_fc)) { - atomic_set(&pn->tx_credits, 1); - sk->sk_write_space(sk); + case PNS_PEP_ENABLE_RESP: + if (sk->sk_state != TCP_SYN_SENT) + break; + + if (pep_enableresp_rcv(sk, skb)) { + sk->sk_state = TCP_CLOSE_WAIT; + break; } - pipe_grant_credits(sk, GFP_ATOMIC); + + sk->sk_state = TCP_ESTABLISHED; + pipe_start_flow_control(sk); break; case PNS_PEP_DISCONNECT_RESP: @@ -864,14 +899,32 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len) int err; u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD }; - pn->pipe_handle = 1; /* anything but INVALID_HANDLE */ + if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE) + pn->pipe_handle = 1; /* anything but INVALID_HANDLE */ + err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ, - PN_PIPE_ENABLE, data, 4); + pn->init_enable, data, 4); if (err) { pn->pipe_handle = PN_PIPE_INVALID_HANDLE; return err; } + sk->sk_state = TCP_SYN_SENT; + + return 0; +} + +static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len) +{ + int err; + + err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD, + NULL, 0); + if (err) + return err; + + sk->sk_state = TCP_SYN_SENT; + return 0; } @@ -879,11 +932,14 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct pep_sock *pn = pep_sk(sk); int answ; + int ret = -ENOIOCTLCMD; switch (cmd) { case SIOCINQ: - if (sk->sk_state == TCP_LISTEN) - return -EINVAL; + if (sk->sk_state == TCP_LISTEN) { + ret = -EINVAL; + break; + } lock_sock(sk); if (sock_flag(sk, SOCK_URGINLINE) && @@ -894,10 +950,22 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) else answ = 0; release_sock(sk); - return put_user(answ, (int __user *)arg); + ret = put_user(answ, (int __user *)arg); + break; + + case SIOCPNENABLEPIPE: + lock_sock(sk); + if (sk->sk_state == TCP_SYN_SENT) + ret = -EBUSY; + else if (sk->sk_state == TCP_ESTABLISHED) + ret = -EISCONN; + else + ret = pep_sock_enable(sk, NULL, 0); + release_sock(sk); + break; } - return -ENOIOCTLCMD; + return ret; } static int pep_init(struct sock *sk) @@ -960,6 +1028,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, } goto out_norel; + case PNPIPE_HANDLE: + if ((sk->sk_state == TCP_CLOSE) && + (val >= 0) && (val < PN_PIPE_INVALID_HANDLE)) + pn->pipe_handle = val; + else + err = -EINVAL; + break; + + case PNPIPE_INITSTATE: + pn->init_enable = !!val; + break; + default: err = -ENOPROTOOPT; } @@ -995,6 +1075,10 @@ static int pep_getsockopt(struct sock *sk, int level, int optname, return -EINVAL; break; + case PNPIPE_INITSTATE: + val = pn->init_enable; + break; + default: return -ENOPROTOOPT; } diff --git a/net/rds/Kconfig b/net/rds/Kconfig index 4cf6dc7910e4..ec753b3ae72a 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -9,7 +9,6 @@ config RDS config RDS_RDMA tristate "RDS over Infiniband and iWARP" - select LLIST depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS ---help--- Allow RDS to use Infiniband and iWARP as a transport. diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 43ea7de2fc8e..4cba13e46ffd 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -306,10 +306,9 @@ static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, td->data_len = len; if (len > 0) { - td->data = kmalloc(len, GFP_KERNEL); + td->data = kmemdup(xdr, len, GFP_KERNEL); if (!td->data) return -ENOMEM; - memcpy(td->data, xdr, len); len = (len + 3) & ~3; toklen -= len; xdr += len >> 2; @@ -401,10 +400,9 @@ static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, _debug("ticket len %u", len); if (len > 0) { - *_ticket = kmalloc(len, GFP_KERNEL); + *_ticket = kmemdup(xdr, len, GFP_KERNEL); if (!*_ticket) return -ENOMEM; - memcpy(*_ticket, xdr, len); len = (len + 3) & ~3; toklen -= len; xdr += len >> 2; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 7b582300d051..51ff19485e12 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -26,6 +26,8 @@ #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> +#include <net/flow_keys.h> + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> #endif @@ -66,134 +68,37 @@ static inline u32 addr_fold(void *addr) return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); } -static u32 flow_get_src(const struct sk_buff *skb, int nhoff) +static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - __be32 *data = NULL, hdata; - - switch (skb->protocol) { - case htons(ETH_P_IP): - data = skb_header_pointer(skb, - nhoff + offsetof(struct iphdr, - saddr), - 4, &hdata); - break; - case htons(ETH_P_IPV6): - data = skb_header_pointer(skb, - nhoff + offsetof(struct ipv6hdr, - saddr.s6_addr32[3]), - 4, &hdata); - break; - } - - if (data) - return ntohl(*data); + if (flow->src) + return ntohl(flow->src); return addr_fold(skb->sk); } -static u32 flow_get_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - __be32 *data = NULL, hdata; - - switch (skb->protocol) { - case htons(ETH_P_IP): - data = skb_header_pointer(skb, - nhoff + offsetof(struct iphdr, - daddr), - 4, &hdata); - break; - case htons(ETH_P_IPV6): - data = skb_header_pointer(skb, - nhoff + offsetof(struct ipv6hdr, - daddr.s6_addr32[3]), - 4, &hdata); - break; - } - - if (data) - return ntohl(*data); + if (flow->dst) + return ntohl(flow->dst); return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } -static u32 flow_get_proto(const struct sk_buff *skb, int nhoff) +static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - __u8 *data = NULL, hdata; - - switch (skb->protocol) { - case htons(ETH_P_IP): - data = skb_header_pointer(skb, - nhoff + offsetof(struct iphdr, - protocol), - 1, &hdata); - break; - case htons(ETH_P_IPV6): - data = skb_header_pointer(skb, - nhoff + offsetof(struct ipv6hdr, - nexthdr), - 1, &hdata); - break; - } - if (data) - return *data; - return 0; + return flow->ip_proto; } -/* helper function to get either src or dst port */ -static __be16 *flow_get_proto_common(const struct sk_buff *skb, int nhoff, - __be16 *_port, int dst) +static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - __be16 *port = NULL; - int poff; - - switch (skb->protocol) { - case htons(ETH_P_IP): { - struct iphdr *iph, _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) - break; - if (ip_is_fragment(iph)) - break; - poff = proto_ports_offset(iph->protocol); - if (poff >= 0) - port = skb_header_pointer(skb, - nhoff + iph->ihl * 4 + poff + dst, - sizeof(*_port), _port); - break; - } - case htons(ETH_P_IPV6): { - struct ipv6hdr *iph, _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (!iph) - break; - poff = proto_ports_offset(iph->nexthdr); - if (poff >= 0) - port = skb_header_pointer(skb, - nhoff + sizeof(*iph) + poff + dst, - sizeof(*_port), _port); - break; - } - } - - return port; -} - -static u32 flow_get_proto_src(const struct sk_buff *skb, int nhoff) -{ - __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 0); - - if (port) - return ntohs(*port); + if (flow->ports) + return ntohs(flow->port16[0]); return addr_fold(skb->sk); } -static u32 flow_get_proto_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 2); - - if (port) - return ntohs(*port); + if (flow->ports) + return ntohs(flow->port16[1]); return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; } @@ -239,7 +144,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb) }) #endif -static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb->protocol) { case htons(ETH_P_IP): @@ -248,10 +153,10 @@ static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff) return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: - return flow_get_src(skb, nhoff); + return flow_get_src(skb, flow); } -static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { switch (skb->protocol) { case htons(ETH_P_IP): @@ -260,21 +165,21 @@ static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff) return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: - return flow_get_dst(skb, nhoff); + return flow_get_dst(skb, flow); } -static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: - return flow_get_proto_src(skb, nhoff); + return flow_get_proto_src(skb, flow); } -static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, int nhoff) +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: - return flow_get_proto_dst(skb, nhoff); + return flow_get_proto_dst(skb, flow); } static u32 flow_get_rtclassid(const struct sk_buff *skb) @@ -314,21 +219,19 @@ static u32 flow_get_rxhash(struct sk_buff *skb) return skb_get_rxhash(skb); } -static u32 flow_key_get(struct sk_buff *skb, int key) +static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) { - int nhoff = skb_network_offset(skb); - switch (key) { case FLOW_KEY_SRC: - return flow_get_src(skb, nhoff); + return flow_get_src(skb, flow); case FLOW_KEY_DST: - return flow_get_dst(skb, nhoff); + return flow_get_dst(skb, flow); case FLOW_KEY_PROTO: - return flow_get_proto(skb, nhoff); + return flow_get_proto(skb, flow); case FLOW_KEY_PROTO_SRC: - return flow_get_proto_src(skb, nhoff); + return flow_get_proto_src(skb, flow); case FLOW_KEY_PROTO_DST: - return flow_get_proto_dst(skb, nhoff); + return flow_get_proto_dst(skb, flow); case FLOW_KEY_IIF: return flow_get_iif(skb); case FLOW_KEY_PRIORITY: @@ -338,13 +241,13 @@ static u32 flow_key_get(struct sk_buff *skb, int key) case FLOW_KEY_NFCT: return flow_get_nfct(skb); case FLOW_KEY_NFCT_SRC: - return flow_get_nfct_src(skb, nhoff); + return flow_get_nfct_src(skb, flow); case FLOW_KEY_NFCT_DST: - return flow_get_nfct_dst(skb, nhoff); + return flow_get_nfct_dst(skb, flow); case FLOW_KEY_NFCT_PROTO_SRC: - return flow_get_nfct_proto_src(skb, nhoff); + return flow_get_nfct_proto_src(skb, flow); case FLOW_KEY_NFCT_PROTO_DST: - return flow_get_nfct_proto_dst(skb, nhoff); + return flow_get_nfct_proto_dst(skb, flow); case FLOW_KEY_RTCLASSID: return flow_get_rtclassid(skb); case FLOW_KEY_SKUID: @@ -361,6 +264,16 @@ static u32 flow_key_get(struct sk_buff *skb, int key) } } +#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \ + (1 << FLOW_KEY_DST) | \ + (1 << FLOW_KEY_PROTO) | \ + (1 << FLOW_KEY_PROTO_SRC) | \ + (1 << FLOW_KEY_PROTO_DST) | \ + (1 << FLOW_KEY_NFCT_SRC) | \ + (1 << FLOW_KEY_NFCT_DST) | \ + (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ + (1 << FLOW_KEY_NFCT_PROTO_DST)) + static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -373,16 +286,19 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, list_for_each_entry(f, &head->filters, list) { u32 keys[f->nkeys]; + struct flow_keys flow_keys; if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; keymask = f->keymask; + if (keymask & FLOW_KEYS_NEEDED) + skb_flow_dissect(skb, &flow_keys); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; keymask &= ~(1 << key); - keys[n] = flow_key_get(skb, key); + keys[n] = flow_key_get(skb, key, &flow_keys); } if (f->mode == FLOW_MODE_HASH) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 3422b25df9e4..205d369a217c 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -19,10 +19,7 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/red.h> -#include <linux/ip.h> -#include <net/ip.h> -#include <linux/ipv6.h> -#include <net/ipv6.h> +#include <net/flow_keys.h> /* CHOKe stateless AQM for fair bandwidth allocation @@ -142,85 +139,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* - * Compare flow of two packets - * Returns true only if source and destination address and port match. - * false for special cases - */ -static bool choke_match_flow(struct sk_buff *skb1, - struct sk_buff *skb2) -{ - int off1, off2, poff; - const u32 *ports1, *ports2; - u8 ip_proto; - __u32 hash1; - - if (skb1->protocol != skb2->protocol) - return false; - - /* Use hash value as quick check - * Assumes that __skb_get_rxhash makes IP header and ports linear - */ - hash1 = skb_get_rxhash(skb1); - if (!hash1 || hash1 != skb_get_rxhash(skb2)) - return false; - - /* Probably match, but be sure to avoid hash collisions */ - off1 = skb_network_offset(skb1); - off2 = skb_network_offset(skb2); - - switch (skb1->protocol) { - case __constant_htons(ETH_P_IP): { - const struct iphdr *ip1, *ip2; - - ip1 = (const struct iphdr *) (skb1->data + off1); - ip2 = (const struct iphdr *) (skb2->data + off2); - - ip_proto = ip1->protocol; - if (ip_proto != ip2->protocol || - ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr) - return false; - - if (ip_is_fragment(ip1) | ip_is_fragment(ip2)) - ip_proto = 0; - off1 += ip1->ihl * 4; - off2 += ip2->ihl * 4; - break; - } - - case __constant_htons(ETH_P_IPV6): { - const struct ipv6hdr *ip1, *ip2; - - ip1 = (const struct ipv6hdr *) (skb1->data + off1); - ip2 = (const struct ipv6hdr *) (skb2->data + off2); - - ip_proto = ip1->nexthdr; - if (ip_proto != ip2->nexthdr || - ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) || - ipv6_addr_cmp(&ip1->daddr, &ip2->daddr)) - return false; - off1 += 40; - off2 += 40; - } - - default: /* Maybe compare MAC header here? */ - return false; - } - - poff = proto_ports_offset(ip_proto); - if (poff < 0) - return true; - - off1 += poff; - off2 += poff; - - ports1 = (__force u32 *)(skb1->data + off1); - ports2 = (__force u32 *)(skb2->data + off2); - return *ports1 == *ports2; -} - struct choke_skb_cb { - u16 classid; + u16 classid; + u8 keys_valid; + struct flow_keys keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -241,6 +163,32 @@ static u16 choke_get_classid(const struct sk_buff *skb) } /* + * Compare flow of two packets + * Returns true only if source and destination address and port match. + * false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, + struct sk_buff *skb2) +{ + if (skb1->protocol != skb2->protocol) + return false; + + if (!choke_skb_cb(skb1)->keys_valid) { + choke_skb_cb(skb1)->keys_valid = 1; + skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); + } + + if (!choke_skb_cb(skb2)->keys_valid) { + choke_skb_cb(skb2)->keys_valid = 1; + skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); + } + + return !memcmp(&choke_skb_cb(skb1)->keys, + &choke_skb_cb(skb2)->keys, + sizeof(struct flow_keys)); +} + +/* * Classify flow using either: * 1. pre-existing classification result in skb * 2. fast internal classification @@ -326,6 +274,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto other_drop; /* Packet was eaten by filter */ } + choke_skb_cb(skb)->keys_valid = 0; /* Compute average queue usage (see RED) */ p->qavg = red_calc_qavg(p, sch->q.qlen); if (red_is_idling(p)) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 69fca2798804..67fc573e013a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -60,7 +60,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) /* check the reason of requeuing without tx lock first */ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - if (!netif_tx_queue_frozen_or_stopped(txq)) { + if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else @@ -121,7 +121,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, spin_unlock(root_lock); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_tx_queue_frozen_or_stopped(txq)) + if (!netif_xmit_frozen_or_stopped(txq)) ret = dev_hard_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); @@ -143,7 +143,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, ret = dev_requeue_skb(skb, q); } - if (ret && netif_tx_queue_frozen_or_stopped(txq)) + if (ret && netif_xmit_frozen_or_stopped(txq)) ret = 0; return ret; @@ -242,10 +242,11 @@ static void dev_watchdog(unsigned long arg) * old device drivers set dev->trans_start */ trans_start = txq->trans_start ? : dev->trans_start; - if (netif_tx_queue_stopped(txq) && + if (netif_xmit_stopped(txq) && time_after(jiffies, (trans_start + dev->watchdog_timeo))) { some_queue_timedout = 1; + txq->trans_timeout++; break; } } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index edc1950e0e77..49131d7a7446 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -107,7 +107,8 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch) /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ - if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) { + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { @@ -138,7 +139,8 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch) /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ - if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) { + if (!netif_xmit_stopped( + netdev_get_tx_queue(qdisc_dev(sch), curband))) { qdisc = q->queues[curband]; skb = qdisc->ops->peek(qdisc); if (skb) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eb3b9a86c6ed..3bfd73344f76 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -79,6 +79,7 @@ struct netem_sched_data { u32 duplicate; u32 reorder; u32 corrupt; + u32 rate; struct crndstate { u32 last; @@ -298,6 +299,14 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } +static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate) +{ + u64 ticks = (u64)len * NSEC_PER_SEC; + + do_div(ticks, rate); + return PSCHED_NS2TICKS(ticks); +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -371,6 +380,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) &q->delay_cor, q->delay_dist); now = psched_get_time(); + + if (q->rate) { + struct sk_buff_head *list = &q->qdisc->q; + + delay += packet_len_2_sched_time(skb->len, q->rate); + + if (!skb_queue_empty(list)) { + /* + * Last packet in queue is reference point (now). + * First packet in queue is already in flight, + * calculate this time bonus and substract + * from delay. + */ + delay -= now - netem_skb_cb(skb_peek(list))->time_to_send; + now = netem_skb_cb(skb_peek_tail(list))->time_to_send; + } + } + cb->time_to_send = now + delay; ++q->counter; ret = qdisc_enqueue(skb, q->qdisc); @@ -535,6 +562,14 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) init_crandom(&q->corrupt_cor, r->correlation); } +static void get_rate(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_rate *r = nla_data(attr); + + q->rate = r->rate; +} + static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); @@ -594,6 +629,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, }; @@ -666,6 +702,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_CORRUPT]) get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + if (tb[TCA_NETEM_RATE]) + get_rate(sch, tb[TCA_NETEM_RATE]); + q->loss_model = CLG_RANDOM; if (tb[TCA_NETEM_LOSS]) ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); @@ -846,6 +885,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; + struct tc_netem_rate rate; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -868,6 +908,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) corrupt.correlation = q->corrupt_cor.rho; NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rate.rate = q->rate; + NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate); + if (dump_loss_model(q, skb) != 0) goto nla_put_failure; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6649463da1b6..d617161f8dd3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -209,8 +209,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) ctl->Plog, ctl->Scell_log, nla_data(tb[TCA_RED_STAB])); - if (skb_queue_empty(&sch->q)) - red_end_of_idle_period(&q->parms); + if (!q->qdisc->q.qlen) + red_start_of_idle_period(&q->parms); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index e83c272c0325..96e42cae4c7a 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -26,6 +26,7 @@ #include <net/ip.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> +#include <net/flow_keys.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -286,6 +287,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) u32 minqlen = ~0; u32 r, slot, salt, sfbhash; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { sch->qstats.overlimits++; @@ -309,13 +311,19 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, q, &ret, &salt)) goto other_drop; + keys.src = salt; + keys.dst = 0; + keys.ports = 0; } else { - salt = skb_get_rxhash(skb); + skb_flow_dissect(skb, &keys); } slot = q->slot; - sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -347,7 +355,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + sfbhash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 4f5510e2bd6f..30cda707e400 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -17,14 +17,13 @@ #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> -#include <linux/ipv6.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <net/ip.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/flow_keys.h> /* Stochastic Fairness Queuing algorithm. @@ -137,61 +136,17 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_SLOTS]; } -static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +static unsigned int sfq_hash(const struct sfq_sched_data *q, + const struct sk_buff *skb) { - return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1); -} - -static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) -{ - u32 h, h2; - - switch (skb->protocol) { - case htons(ETH_P_IP): - { - const struct iphdr *iph; - int poff; - - if (!pskb_network_may_pull(skb, sizeof(*iph))) - goto err; - iph = ip_hdr(skb); - h = (__force u32)iph->daddr; - h2 = (__force u32)iph->saddr ^ iph->protocol; - if (ip_is_fragment(iph)) - break; - poff = proto_ports_offset(iph->protocol); - if (poff >= 0 && - pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { - iph = ip_hdr(skb); - h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff); - } - break; - } - case htons(ETH_P_IPV6): - { - const struct ipv6hdr *iph; - int poff; - - if (!pskb_network_may_pull(skb, sizeof(*iph))) - goto err; - iph = ipv6_hdr(skb); - h = (__force u32)iph->daddr.s6_addr32[3]; - h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; - poff = proto_ports_offset(iph->nexthdr); - if (poff >= 0 && - pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { - iph = ipv6_hdr(skb); - h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff); - } - break; - } - default: -err: - h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol; - h2 = (unsigned long)skb->sk; - } + struct flow_keys keys; + unsigned int hash; - return sfq_fold_hash(q, h, h2); + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return hash & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index a3b7120fcc74..45326599fda3 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -225,11 +225,11 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) static int -__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, + struct net_device *dev, struct netdev_queue *txq, + struct neighbour *mn) { - struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0); - struct teql_sched_data *q = qdisc_priv(dev_queue->qdisc); - struct neighbour *mn = dst_get_neighbour(skb_dst(skb)); + struct teql_sched_data *q = qdisc_priv(txq->qdisc); struct neighbour *n = q->ncache; if (mn->tbl == NULL) @@ -262,17 +262,26 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device * } static inline int teql_resolve(struct sk_buff *skb, - struct sk_buff *skb_res, struct net_device *dev) + struct sk_buff *skb_res, + struct net_device *dev, + struct netdev_queue *txq) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct dst_entry *dst = skb_dst(skb); + struct neighbour *mn; + int res; + if (txq->qdisc == &noop_qdisc) return -ENODEV; - if (dev->header_ops == NULL || - skb_dst(skb) == NULL || - dst_get_neighbour(skb_dst(skb)) == NULL) + if (!dev->header_ops || !dst) return 0; - return __teql_resolve(skb, skb_res, dev); + + rcu_read_lock(); + mn = dst_get_neighbour_noref(dst); + res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0; + rcu_read_unlock(); + + return res; } static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) @@ -301,18 +310,18 @@ restart: if (slave_txq->qdisc_sleeping != q) continue; - if (__netif_subqueue_stopped(slave, subq) || + if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) || !netif_running(slave)) { busy = 1; continue; } - switch (teql_resolve(skb, skb_res, slave)) { + switch (teql_resolve(skb, skb_res, slave, slave_txq)) { case 0: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); - if (!netif_tx_queue_frozen_or_stopped(slave_txq) && + if (!netif_xmit_frozen_or_stopped(slave_txq) && slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); @@ -324,7 +333,7 @@ restart: } __netif_tx_unlock(slave_txq); } - if (netif_queue_stopped(dev)) + if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0))) busy = 1; break; case 1: diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 865e68fef21c..bf812048cf6f 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -82,7 +82,7 @@ static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp) struct sctp_auth_bytes *key; /* Verify that we are not going to overflow INT_MAX */ - if ((INT_MAX - key_len) < sizeof(struct sctp_auth_bytes)) + if (key_len > (INT_MAX - sizeof(struct sctp_auth_bytes))) return NULL; /* Allocate the shared key */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 810427833bcd..91f479121c55 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -107,7 +107,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_port = 0; - ipv6_addr_copy(&addr->a.v6.sin6_addr, &ifa->addr); + addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; spin_lock_bh(&sctp_local_addr_lock); @@ -219,8 +219,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) /* Fill in the dest address from the route entry passed with the skb * and the source address from the transport. */ - ipv6_addr_copy(&fl6.daddr, &transport->ipaddr.v6.sin6_addr); - ipv6_addr_copy(&fl6.saddr, &transport->saddr.v6.sin6_addr); + fl6.daddr = transport->ipaddr.v6.sin6_addr; + fl6.saddr = transport->saddr.v6.sin6_addr; fl6.flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl6.flowlabel); @@ -231,7 +231,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) if (np->opt && np->opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&fl6.daddr, rt0->addr); + fl6.daddr = *rt0->addr; } SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", @@ -265,7 +265,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, sctp_scope_t scope; memset(fl6, 0, sizeof(struct flowi6)); - ipv6_addr_copy(&fl6->daddr, &daddr->v6.sin6_addr); + fl6->daddr = daddr->v6.sin6_addr; fl6->fl6_dport = daddr->v6.sin6_port; fl6->flowi6_proto = IPPROTO_SCTP; if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) @@ -277,7 +277,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->fl6_sport = htons(asoc->base.bind_addr.port); if (saddr) { - ipv6_addr_copy(&fl6->saddr, &saddr->v6.sin6_addr); + fl6->saddr = saddr->v6.sin6_addr; fl6->fl6_sport = saddr->v6.sin6_port; SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl6->saddr); } @@ -334,7 +334,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, } rcu_read_unlock(); if (baddr) { - ipv6_addr_copy(&fl6->saddr, &baddr->v6.sin6_addr); + fl6->saddr = baddr->v6.sin6_addr; fl6->fl6_sport = baddr->v6.sin6_port; dst = ip6_dst_lookup_flow(sk, fl6, NULL, false); } @@ -375,7 +375,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk, if (t->dst) { saddr->v6.sin6_family = AF_INET6; - ipv6_addr_copy(&saddr->v6.sin6_addr, &fl6->saddr); + saddr->v6.sin6_addr = fl6->saddr; } } @@ -400,7 +400,7 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_port = 0; - ipv6_addr_copy(&addr->a.v6.sin6_addr, &ifp->addr); + addr->a.v6.sin6_addr = ifp->addr; addr->a.v6.sin6_scope_id = dev->ifindex; addr->valid = 1; INIT_LIST_HEAD(&addr->list); @@ -416,7 +416,6 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb, int is_saddr) { - void *from; __be16 *port; struct sctphdr *sh; @@ -428,12 +427,11 @@ static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb, sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; - from = &ipv6_hdr(skb)->saddr; + addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; } else { *port = sh->dest; - from = &ipv6_hdr(skb)->daddr; + addr->v6.sin6_addr = ipv6_hdr(skb)->daddr; } - ipv6_addr_copy(&addr->v6.sin6_addr, from); } /* Initialize an sctp_addr from a socket. */ @@ -441,7 +439,7 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; - ipv6_addr_copy(&addr->v6.sin6_addr, &inet6_sk(sk)->rcv_saddr); + addr->v6.sin6_addr = inet6_sk(sk)->rcv_saddr; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ @@ -454,7 +452,7 @@ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) inet6_sk(sk)->rcv_saddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { - ipv6_addr_copy(&inet6_sk(sk)->rcv_saddr, &addr->v6.sin6_addr); + inet6_sk(sk)->rcv_saddr = addr->v6.sin6_addr; } } @@ -467,7 +465,7 @@ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) inet6_sk(sk)->daddr.s6_addr32[2] = htonl(0x0000ffff); inet6_sk(sk)->daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { - ipv6_addr_copy(&inet6_sk(sk)->daddr, &addr->v6.sin6_addr); + inet6_sk(sk)->daddr = addr->v6.sin6_addr; } } @@ -479,7 +477,7 @@ static void sctp_v6_from_addr_param(union sctp_addr *addr, addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ - ipv6_addr_copy(&addr->v6.sin6_addr, ¶m->v6.addr); + addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; } @@ -493,7 +491,7 @@ static int sctp_v6_to_addr_param(const union sctp_addr *addr, param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); - ipv6_addr_copy(¶m->v6.addr, &addr->v6.sin6_addr); + param->v6.addr = addr->v6.sin6_addr; return length; } @@ -504,7 +502,7 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; - ipv6_addr_copy(&addr->v6.sin6_addr, saddr); + addr->v6.sin6_addr = *saddr; } /* Compare addresses exactly. @@ -759,7 +757,7 @@ static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, } sin6from = &asoc->peer.primary_addr.v6; - ipv6_addr_copy(&sin6->sin6_addr, &sin6from->sin6_addr); + sin6->sin6_addr = sin6from->sin6_addr; if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6->sin6_scope_id = sin6from->sin6_scope_id; } @@ -787,7 +785,7 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, } /* Otherwise, just copy the v6 address. */ - ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); + sin6->sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) { struct sctp_ulpevent *ev = sctp_skb2event(skb); sin6->sin6_scope_id = ev->iif; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0121e0ab0351..a85eeeb55dd0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3400,8 +3400,10 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, asconf_len -= length; } - if (no_err && asoc->src_out_of_asoc_ok) + if (no_err && asoc->src_out_of_asoc_ok) { asoc->src_out_of_asoc_ok = 0; + sctp_transport_immediate_rtx(asoc->peer.primary_path); + } /* Free the cached last sent asconf chunk. */ list_del_init(&asconf->transmitted_list); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 76388b083f28..1ff51c9d18d5 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -666,6 +666,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, struct sctp_chunk *chunk) { sctp_sender_hb_info_t *hbinfo; + int was_unconfirmed = 0; /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the * HEARTBEAT should clear the error counter of the destination @@ -692,9 +693,11 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, /* Mark the destination transport address as active if it is not so * marked. */ - if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) + if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) { + was_unconfirmed = 1; sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); + } /* The receiver of the HEARTBEAT ACK should also perform an * RTT measurement for that destination transport address @@ -712,6 +715,9 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, /* Update the heartbeat timer. */ if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) sctp_transport_hold(t); + + if (was_unconfirmed && asoc->peer.transport_count == 1) + sctp_transport_immediate_rtx(t); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 13bf5fcdbff1..d56c07a3d435 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -804,7 +804,7 @@ static int sctp_send_asconf_del_ip(struct sock *sk, struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addrs; - ipv6_addr_copy(&asoc->asconf_addr_del_pending->v6.sin6_addr, &sin6->sin6_addr); + asoc->asconf_addr_del_pending->v6.sin6_addr = sin6->sin6_addr; } SCTP_DEBUG_PRINTK_IPADDR("send_asconf_del_ip: keep the last address asoc: %p ", " at %p\n", asoc, asoc->asconf_addr_del_pending, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 394c57ca2f54..3889330b7b04 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -641,3 +641,19 @@ void sctp_transport_reset(struct sctp_transport *t) t->cacc.next_tsn_at_change = 0; t->cacc.cacc_saw_newack = 0; } + +/* Schedule retransmission on the given transport */ +void sctp_transport_immediate_rtx(struct sctp_transport *t) +{ + /* Stop pending T3_rtx_timer */ + if (timer_pending(&t->T3_rtx_timer)) { + (void)del_timer(&t->T3_rtx_timer); + sctp_transport_put(t); + } + sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX); + if (!timer_pending(&t->T3_rtx_timer)) { + if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto)) + sctp_transport_hold(t); + } + return; +} diff --git a/net/socket.c b/net/socket.c index 425ef4270460..e62b4f055071 100644 --- a/net/socket.c +++ b/net/socket.c @@ -551,6 +551,8 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, sock_update_classid(sock->sk); + sock_update_netprioidx(sock->sk); + si->sock = sock; si->scm = NULL; si->msg = msg; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index ce136323da8b..fe258fc37f50 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -134,7 +134,7 @@ static void ip_map_init(struct cache_head *cnew, struct cache_head *citem) struct ip_map *item = container_of(citem, struct ip_map, h); strcpy(new->m_class, item->m_class); - ipv6_addr_copy(&new->m_addr, &item->m_addr); + new->m_addr = item->m_addr; } static void update(struct cache_head *cnew, struct cache_head *citem) { @@ -274,7 +274,7 @@ static int ip_map_show(struct seq_file *m, } im = container_of(h, struct ip_map, h); /* class addr domain */ - ipv6_addr_copy(&addr, &im->m_addr); + addr = im->m_addr; if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) @@ -297,7 +297,7 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct cache_head *ch; strcpy(ip.m_class, class); - ipv6_addr_copy(&ip.m_addr, addr); + ip.m_addr = *addr; ch = sunrpc_cache_lookup(cd, &ip.h, hash_str(class, IP_HASHBITS) ^ hash_ip6(*addr)); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 71bed1c1c77a..4653286fcc9e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -157,7 +157,7 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) cmh->cmsg_level = SOL_IPV6; cmh->cmsg_type = IPV6_PKTINFO; pki->ipi6_ifindex = daddr->sin6_scope_id; - ipv6_addr_copy(&pki->ipi6_addr, &daddr->sin6_addr); + pki->ipi6_addr = daddr->sin6_addr; cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); } break; @@ -523,7 +523,7 @@ static int svc_udp_get_dest_address6(struct svc_rqst *rqstp, return 0; daddr->sin6_family = AF_INET6; - ipv6_addr_copy(&daddr->sin6_addr, &pki->ipi6_addr); + daddr->sin6_addr = pki->ipi6_addr; daddr->sin6_scope_id = pki->ipi6_ifindex; return 1; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d7f97ef26590..55472c48825e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -496,7 +496,7 @@ static int xs_nospace(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - int ret = 0; + int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", task->tk_pid, req->rq_slen - req->rq_bytes_sent, @@ -508,7 +508,6 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { - ret = -EAGAIN; /* * Notify TCP that we're limited by the application * window size @@ -2530,8 +2529,10 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, int err; err = xs_init_anyaddr(args->dstaddr->sa_family, (struct sockaddr *)&new->srcaddr); - if (err != 0) + if (err != 0) { + xprt_free(xprt); return ERR_PTR(err); + } } return xprt; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 466fbcc5cf77..b595a3d8679f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1957,6 +1957,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if ((UNIXCB(skb).pid != siocb->scm->pid) || (UNIXCB(skb).cred != siocb->scm->cred)) { skb_queue_head(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); break; } } else { @@ -1974,6 +1975,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, chunk = min_t(unsigned int, skb->len, size); if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); if (copied == 0) copied = -EFAULT; break; @@ -1991,6 +1993,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, /* put the skb back if we didn't use it up.. */ if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); break; } @@ -2006,6 +2009,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); break; } } while (size); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 552df27dcf53..82e803b56952 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -61,8 +61,8 @@ __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; - return addr_match(&fl4->daddr, &sel->daddr, sel->prefixlen_d) && - addr_match(&fl4->saddr, &sel->saddr, sel->prefixlen_s) && + return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && + addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && @@ -1499,7 +1499,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto free_dst; /* Copy neighbour for reachability confirmation */ - dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour(dst))); + dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour_noref(dst))); xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); xfrm_init_pmtu(dst_prev); @@ -2382,9 +2382,11 @@ static unsigned int xfrm_default_advmss(const struct dst_entry *dst) return dst_metric_advmss(dst->path); } -static unsigned int xfrm_default_mtu(const struct dst_entry *dst) +static unsigned int xfrm_mtu(const struct dst_entry *dst) { - return dst_mtu(dst->path); + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst_mtu(dst->path); } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, const void *daddr) @@ -2411,8 +2413,8 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; - if (likely(dst_ops->default_mtu == NULL)) - dst_ops->default_mtu = xfrm_default_mtu; + if (likely(dst_ops->mtu == NULL)) + dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9414b9c5b1e4..5b228f97d4b3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1035,16 +1035,12 @@ static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m, break; case AF_INET6: - ipv6_addr_copy((struct in6_addr *)x->sel.daddr.a6, - (const struct in6_addr *)daddr); - ipv6_addr_copy((struct in6_addr *)x->sel.saddr.a6, - (const struct in6_addr *)saddr); + *(struct in6_addr *)x->sel.daddr.a6 = *(struct in6_addr *)daddr; + *(struct in6_addr *)x->sel.saddr.a6 = *(struct in6_addr *)saddr; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; - ipv6_addr_copy((struct in6_addr *)x->props.saddr.a6, - (const struct in6_addr *)saddr); - ipv6_addr_copy((struct in6_addr *)x->id.daddr.a6, - (const struct in6_addr *)daddr); + *(struct in6_addr *)x->props.saddr.a6 = *(struct in6_addr *)saddr; + *(struct in6_addr *)x->id.daddr.a6 = *(struct in6_addr *)daddr; break; } |
