diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 17 | ||||
-rw-r--r-- | net/core/datagram.c | 482 | ||||
-rw-r--r-- | net/core/dev.c | 3359 | ||||
-rw-r--r-- | net/core/dev_mcast.c | 299 | ||||
-rw-r--r-- | net/core/dst.c | 276 | ||||
-rw-r--r-- | net/core/dv.c | 548 | ||||
-rw-r--r-- | net/core/ethtool.c | 819 | ||||
-rw-r--r-- | net/core/filter.c | 432 | ||||
-rw-r--r-- | net/core/flow.c | 371 | ||||
-rw-r--r-- | net/core/gen_estimator.c | 250 | ||||
-rw-r--r-- | net/core/gen_stats.c | 239 | ||||
-rw-r--r-- | net/core/iovec.c | 239 | ||||
-rw-r--r-- | net/core/link_watch.c | 137 | ||||
-rw-r--r-- | net/core/neighbour.c | 2362 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 461 | ||||
-rw-r--r-- | net/core/netfilter.c | 799 | ||||
-rw-r--r-- | net/core/netpoll.c | 735 | ||||
-rw-r--r-- | net/core/pktgen.c | 3132 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 711 | ||||
-rw-r--r-- | net/core/scm.c | 291 | ||||
-rw-r--r-- | net/core/skbuff.c | 1460 | ||||
-rw-r--r-- | net/core/sock.c | 1565 | ||||
-rw-r--r-- | net/core/stream.c | 287 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 182 | ||||
-rw-r--r-- | net/core/utils.c | 155 | ||||
-rw-r--r-- | net/core/wireless.c | 1459 |
26 files changed, 21067 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile new file mode 100644 index 000000000000..81f03243fe2f --- /dev/null +++ b/net/core/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the Linux networking core. +# + +obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_core.o + +obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \ + neighbour.o rtnetlink.o utils.o link_watch.o filter.o + +obj-$(CONFIG_SYSFS) += net-sysfs.o +obj-$(CONFIG_NETFILTER) += netfilter.o +obj-$(CONFIG_NET_DIVERT) += dv.o +obj-$(CONFIG_NET_PKTGEN) += pktgen.o +obj-$(CONFIG_NET_RADIO) += wireless.o +obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/datagram.c b/net/core/datagram.c new file mode 100644 index 000000000000..d1bfd279cc1a --- /dev/null +++ b/net/core/datagram.c @@ -0,0 +1,482 @@ +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all + * protocols. Possibly a generic IP version on top of these would + * make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and + * NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. The poll was + * shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old + * udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() + * understood + * Alan Cox : Rewrote skb_read_datagram to avoid the + * skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. + * IPX can no longer use the SO_TYPE hack + * but AX.25 now works right, and SPX is + * feasible. + * Alan Cox : Fixed write poll of non IP protocol + * crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * Pete Wyckoff : Unconnected accept() fix. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/inet.h> +#include <linux/tcp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/poll.h> +#include <linux/highmem.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/checksum.h> + + +/* + * Is a socket 'connection oriented' ? + */ +static inline int connection_based(struct sock *sk) +{ + return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; +} + +/* + * Wait for a packet.. + */ +static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +{ + int error; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out_err; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + goto out; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out_noerr; + + /* Sequenced packets can come disconnected. + * If so we report the problem + */ + error = -ENOTCONN; + if (connection_based(sk) && + !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) + goto out_err; + + /* handle signals */ + if (signal_pending(current)) + goto interrupted; + + error = 0; + *timeo_p = schedule_timeout(*timeo_p); +out: + finish_wait(sk->sk_sleep, &wait); + return error; +interrupted: + error = sock_intr_errno(*timeo_p); +out_err: + *err = error; + goto out; +out_noerr: + *err = 0; + error = 1; + goto out; +} + +/** + * skb_recv_datagram - Receive a datagram skbuff + * @sk - socket + * @flags - MSG_ flags + * @noblock - blocking operation? + * @err - error code returned + * + * Get a datagram skbuff, understands the peeking, nonblocking wakeups + * and possible races. This replaces identical code in packet, raw and + * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes + * the long standing peek and read race for datagram sockets. If you + * alter this routine remember it must be re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling + * skb_free_datagram) + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, + int noblock, int *err) +{ + struct sk_buff *skb; + long timeo; + /* + * Caller is allowed not to check sk->sk_err before skb_recv_datagram() + */ + int error = sock_error(sk); + + if (error) + goto no_packet; + + timeo = sock_rcvtimeo(sk, noblock); + + do { + /* Again only user level code calls this function, so nothing + * interrupt level will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... + * However, this function was corrent in any case. 8) + */ + if (flags & MSG_PEEK) { + unsigned long cpu_flags; + + spin_lock_irqsave(&sk->sk_receive_queue.lock, + cpu_flags); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + atomic_inc(&skb->users); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, + cpu_flags); + } else + skb = skb_dequeue(&sk->sk_receive_queue); + + if (skb) + return skb; + + /* User doesn't want to wait */ + error = -EAGAIN; + if (!timeo) + goto no_packet; + + } while (!wait_for_packet(sk, err, &timeo)); + + return NULL; + +no_packet: + *err = error; + return NULL; +} + +void skb_free_datagram(struct sock *sk, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/** + * skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * + * Note: the iovec is modified during the copy. + */ +int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, + struct iovec *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovec(to, skb->data + offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovec(to, vaddr + frag->page_offset + + offset - start, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iovec(list, + offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, + u8 __user *to, int len, + unsigned int *csump) +{ + int start = skb_headlen(skb); + int pos = 0; + int i, copy = start - offset; + + /* Copy header. */ + if (copy > 0) { + int err = 0; + if (copy > len) + copy = len; + *csump = csum_and_copy_to_user(skb->data + offset, to, copy, + *csump, &err); + if (err) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + unsigned int csum2; + int err = 0; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + csum2 = csum_and_copy_to_user(vaddr + + frag->page_offset + + offset - start, + to, copy, 0, &err); + kunmap(page); + if (err) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if (!(len -= copy)) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list=list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + unsigned int csum2 = 0; + if (copy > len) + copy = len; + if (skb_copy_and_csum_datagram(list, + offset - start, + to, copy, + &csum2)) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +/** + * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * @skb - skbuff + * @hlen - hardware length + * @iovec - io vector + * + * Caller _must_ check that skb will fit to this iovec. + * + * Returns: 0 - success. + * -EINVAL - checksum failure. + * -EFAULT - fault during copy. Beware, in this case iovec + * can be modified! + */ +int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, + int hlen, struct iovec *iov) +{ + unsigned int csum; + int chunk = skb->len - hlen; + + /* Skip filled elements. + * Pretty silly, look at memcpy_toiovec, though 8) + */ + while (!iov->iov_len) + iov++; + + if (iov->iov_len < chunk) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, + skb->csum))) + goto csum_error; + if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + goto fault; + } else { + csum = csum_partial(skb->data, hlen, skb->csum); + if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + chunk, &csum)) + goto fault; + if ((unsigned short)csum_fold(csum)) + goto csum_error; + iov->iov_len -= chunk; + iov->iov_base += chunk; + } + return 0; +csum_error: + return -EINVAL; +fault: + return -EFAULT; +} + +/** + * datagram_poll - generic datagram poll + * @file - file struct + * @sock - socket + * @wait - poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ +unsigned int datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + if (sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->sk_state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} + +EXPORT_SYMBOL(datagram_poll); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_datagram_iovec); +EXPORT_SYMBOL(skb_free_datagram); +EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c new file mode 100644 index 000000000000..42344d903692 --- /dev/null +++ b/net/core/dev.c @@ -0,0 +1,3359 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Florian la Roche <rzsfl@rz.uni-sb.de> + * Alan Cox <gw4pts@gw4pts.ampr.org> + * David Hinds <dahinds@users.sourceforge.net> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Adam Sulmicki <adam@cfar.umd.edu> + * Pekka Riikonen <priikone@poesidon.pspt.fi> + * + * Changes: + * D.J. Barrow : Fixed bug where dev->refcnt gets set + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant + * stunts to keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into + * drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before + * calling netif_rx. Saves a function + * call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close + * changes. + * Rudi Cilibrasi : Pass the right thing to + * set_mac_address() + * Dave Miller : 32bit quantity for the device lock to + * make it work out on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russell : SIOCSIFNAME + * Pekka Riikonen : Netdev boot-time settings code + * Andrew Morton : Make unregister_netdevice wait + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling + * - netif_rx() feedback + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/if_bridge.h> +#include <linux/divert.h> +#include <net/dst.h> +#include <net/pkt_sched.h> +#include <net/checksum.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/netpoll.h> +#include <linux/rcupdate.h> +#include <linux/delay.h> +#ifdef CONFIG_NET_RADIO +#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ +#include <net/iw_handler.h> +#endif /* CONFIG_NET_RADIO */ +#include <asm/current.h> + +/* This define, if set, will randomly drop a packet when congestion + * is more than moderate. It helps fairness in the multi-interface + * case when one of them is a hog, but it kills performance for the + * single interface case so it is off now by default. + */ +#undef RAND_LIE + +/* Setting this will sample the queue lengths and thus congestion + * via a timer instead of as each packet is received. + */ +#undef OFFLINE_SAMPLE + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * NOTE: That is no longer true with the addition of VLAN tags. Not + * sure which should go first, but I bet it won't make much + * difference if we are running VLANs. The good news is that + * this protocol won't be in the list unless compiled in, so + * the average user (w/out VLANs) will not be adversly affected. + * --BLG + * + * 0800 IP + * 8100 802.1Q VLAN + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +static DEFINE_SPINLOCK(ptype_lock); +static struct list_head ptype_base[16]; /* 16 way hashed list */ +static struct list_head ptype_all; /* Taps */ + +#ifdef OFFLINE_SAMPLE +static void sample_queue(unsigned long dummy); +static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); +#endif + +/* + * The @dev_base list is protected by @dev_base_lock and the rtln + * semaphore. + * + * Pure readers hold dev_base_lock for reading. + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base list, and hold dev_base_lock for writing when they do the + * actual updates. This allows pure readers to access the list even + * while a writer is preparing to update it. + * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ +struct net_device *dev_base; +static struct net_device **dev_tail = &dev_base; +DEFINE_RWLOCK(dev_base_lock); + +EXPORT_SYMBOL(dev_base); +EXPORT_SYMBOL(dev_base_lock); + +#define NETDEV_HASHBITS 8 +static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; +static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; + +static inline struct hlist_head *dev_name_hash(const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; +} + +static inline struct hlist_head *dev_index_hash(int ifindex) +{ + return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; +} + +/* + * Our notifier list + */ + +static struct notifier_block *netdev_chain; + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the local softnet handler. + */ +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; + +#ifdef CONFIG_SYSFS +extern int netdev_sysfs_init(void); +extern int netdev_register_sysfs(struct net_device *); +extern void netdev_unregister_sysfs(struct net_device *); +#else +#define netdev_sysfs_init() (0) +#define netdev_register_sysfs(dev) (0) +#define netdev_unregister_sysfs(dev) do { } while(0) +#endif + + +/******************************************************************************* + + Protocol management and registration routines + +*******************************************************************************/ + +/* + * For efficiency + */ + +int netdev_nit; + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscuous ptype_all chain in net_bh. + * It is true now, do not change it. + * Explanation follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +/** + * dev_add_pack - add packet handler + * @pt: packet type declaration + * + * Add a protocol handler to the networking stack. The passed &packet_type + * is linked into kernel lists and may not be freed until it has been + * removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new packet type (until the next received packet). + */ + +void dev_add_pack(struct packet_type *pt) +{ + int hash; + + spin_lock_bh(&ptype_lock); + if (pt->type == htons(ETH_P_ALL)) { + netdev_nit++; + list_add_rcu(&pt->list, &ptype_all); + } else { + hash = ntohs(pt->type) & 15; + list_add_rcu(&pt->list, &ptype_base[hash]); + } + spin_unlock_bh(&ptype_lock); +} + +extern void linkwatch_run_queue(void); + + + +/** + * __dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_pack(struct packet_type *pt) +{ + struct list_head *head; + struct packet_type *pt1; + + spin_lock_bh(&ptype_lock); + + if (pt->type == htons(ETH_P_ALL)) { + netdev_nit--; + head = &ptype_all; + } else + head = &ptype_base[ntohs(pt->type) & 15]; + + list_for_each_entry(pt1, head, list) { + if (pt == pt1) { + list_del_rcu(&pt->list); + goto out; + } + } + + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +out: + spin_unlock_bh(&ptype_lock); +} +/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_pack(struct packet_type *pt) +{ + __dev_remove_pack(pt); + + synchronize_net(); +} + +/****************************************************************************** + + Device Boot-time Settings Routines + +*******************************************************************************/ + +/* Boot time configuration table */ +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + +/** + * netdev_boot_setup_add - add new setup entry + * @name: name of the device + * @map: configured settings for the device + * + * Adds new setup entry to the dev_boot_setup list. The function + * returns 0 on error and 1 on success. This is a generic routine to + * all netdevices. + */ +static int netdev_boot_setup_add(char *name, struct ifmap *map) +{ + struct netdev_boot_setup *s; + int i; + + s = dev_boot_setup; + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { + memset(s[i].name, 0, sizeof(s[i].name)); + strcpy(s[i].name, name); + memcpy(&s[i].map, map, sizeof(s[i].map)); + break; + } + } + + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; +} + +/** + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice + * + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. + */ +int netdev_boot_setup_check(struct net_device *dev) +{ + struct netdev_boot_setup *s = dev_boot_setup; + int i; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && + !strncmp(dev->name, s[i].name, strlen(s[i].name))) { + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; + return 1; + } + } + return 0; +} + + +/** + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device + * + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. + */ +unsigned long netdev_boot_base(const char *prefix, int unit) +{ + const struct netdev_boot_setup *s = dev_boot_setup; + char name[IFNAMSIZ]; + int i; + + sprintf(name, "%s%d", prefix, unit); + + /* + * If device already registered then return base of 1 + * to indicate not to probe for this interface + */ + if (__dev_get_by_name(name)) + return 1; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) + if (!strcmp(name, s[i].name)) + return s[i].map.base_addr; + return 0; +} + +/* + * Saves at boot time configured settings for any netdevice. + */ +int __init netdev_boot_setup(char *str) +{ + int ints[5]; + struct ifmap map; + + str = get_options(str, ARRAY_SIZE(ints), ints); + if (!str || !*str) + return 0; + + /* Save settings */ + memset(&map, 0, sizeof(map)); + if (ints[0] > 0) + map.irq = ints[1]; + if (ints[0] > 1) + map.base_addr = ints[2]; + if (ints[0] > 2) + map.mem_start = ints[3]; + if (ints[0] > 3) + map.mem_end = ints[4]; + + /* Add new entry to the list */ + return netdev_boot_setup_add(str, &map); +} + +__setup("netdev=", netdev_boot_setup); + +/******************************************************************************* + + Device Interface Subroutines + +*******************************************************************************/ + +/** + * __dev_get_by_name - find a device by its name + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * or @dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +struct net_device *__dev_get_by_name(const char *name) +{ + struct hlist_node *p; + + hlist_for_each(p, dev_name_hash(name)) { + struct net_device *dev + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + } + return NULL; +} + +/** + * dev_get_by_name - find a device by its name + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. + */ + +struct net_device *dev_get_by_name(const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(name); + if (dev) + dev_hold(dev); + read_unlock(&dev_base_lock); + return dev; +} + +/** + * __dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the RTNL semaphore + * or @dev_base_lock. + */ + +struct net_device *__dev_get_by_index(int ifindex) +{ + struct hlist_node *p; + + hlist_for_each(p, dev_index_hash(ifindex)) { + struct net_device *dev + = hlist_entry(p, struct net_device, index_hlist); + if (dev->ifindex == ifindex) + return dev; + } + return NULL; +} + + +/** + * dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(int ifindex) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifindex); + if (dev) + dev_hold(dev); + read_unlock(&dev_base_lock); + return dev; +} + +/** + * dev_getbyhwaddr - find a device by its hardware address + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. The caller must hold the + * rtnl semaphore. The returned device has not had its ref count increased + * and the caller must therefore be careful about locking + * + * BUGS: + * If the API was consistent this would be __dev_get_by_hwaddr + */ + +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + + for (dev = dev_base; dev; dev = dev->next) + if (dev->type == type && + !memcmp(dev->dev_addr, ha, dev->addr_len)) + break; + return dev; +} + +struct net_device *dev_getfirstbyhwtype(unsigned short type) +{ + struct net_device *dev; + + rtnl_lock(); + for (dev = dev_base; dev; dev = dev->next) { + if (dev->type == type) { + dev_hold(dev); + break; + } + } + rtnl_unlock(); + return dev; +} + +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + * dev_get_by_flags - find any device with given flags + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. Returns NULL if a device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if (((dev->flags ^ if_flags) & mask) == 0) { + dev_hold(dev); + break; + } + } + read_unlock(&dev_base_lock); + return dev; +} + +/** + * dev_valid_name - check if name is okay for network device + * @name: name string + * + * Network device names need to be valid file names to + * to allow sysfs to work + */ +static int dev_valid_name(const char *name) +{ + return !(*name == '\0' + || !strcmp(name, ".") + || !strcmp(name, "..") + || strchr(name, '/')); +} + +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. Not efficient for many devices, not called a lot. The caller + * must hold the dev_base or rtnl lock while allocating the name and + * adding the device in order to avoid duplicates. Returns the number + * of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + int i = 0; + char buf[IFNAMSIZ]; + const char *p; + const int max_netdevices = 8*PAGE_SIZE; + long *inuse; + struct net_device *d; + + p = strnchr(name, IFNAMSIZ-1, '%'); + if (p) { + /* + * Verify the string as this thing may have come from + * the user. There must be either one "%d" and no other "%" + * characters. + */ + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + /* Use one page as a bit array of possible slots */ + inuse = (long *) get_zeroed_page(GFP_ATOMIC); + if (!inuse) + return -ENOMEM; + + for (d = dev_base; d; d = d->next) { + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, sizeof(buf), name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, max_netdevices); + free_page((unsigned long) inuse); + } + + snprintf(buf, sizeof(buf), name, i); + if (!__dev_get_by_name(buf)) { + strlcpy(dev->name, buf, IFNAMSIZ); + return i; + } + + /* It is possible to run out of possible slots + * when the name is long and there isn't enough space left + * for the digits, or if all bits are used. + */ + return -ENFILE; +} + + +/** + * dev_change_name - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + */ +int dev_change_name(struct net_device *dev, char *newname) +{ + int err = 0; + + ASSERT_RTNL(); + + if (dev->flags & IFF_UP) + return -EBUSY; + + if (!dev_valid_name(newname)) + return -EINVAL; + + if (strchr(newname, '%')) { + err = dev_alloc_name(dev, newname); + if (err < 0) + return err; + strcpy(newname, dev->name); + } + else if (__dev_get_by_name(newname)) + return -EEXIST; + else + strlcpy(dev->name, newname, IFNAMSIZ); + + err = class_device_rename(&dev->class_dev, dev->name); + if (!err) { + hlist_del(&dev->name_hlist); + hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); + notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + } + + return err; +} + +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + } +} + +/** + * dev_load - load a network module + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(name); + read_unlock(&dev_base_lock); + + if (!dev && capable(CAP_SYS_MODULE)) + request_module("%s", name); +} + +static int default_rebuild_header(struct sk_buff *skb) +{ + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", + skb->dev ? skb->dev->name : "NULL!!!"); + kfree_skb(skb); + return 1; +} + + +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + */ +int dev_open(struct net_device *dev) +{ + int ret = 0; + + /* + * Is it already up? + */ + + if (dev->flags & IFF_UP) + return 0; + + /* + * Is it even present? + */ + if (!netif_device_present(dev)) + return -ENODEV; + + /* + * Call device private open method + */ + set_bit(__LINK_STATE_START, &dev->state); + if (dev->open) { + ret = dev->open(dev); + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + } + + /* + * If it went open OK then: + */ + + if (!ret) { + /* + * Set the flags. + */ + dev->flags |= IFF_UP; + + /* + * Initialize multicasting status + */ + dev_mc_upload(dev); + + /* + * Wakeup transmit queue engine + */ + dev_activate(dev); + + /* + * ... and announce new interface. + */ + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + } + return ret; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + if (!(dev->flags & IFF_UP)) + return 0; + + /* + * Tell people we are going down, so that they can + * prepare to death, when device is still operating. + */ + notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); + + dev_deactivate(dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, + * it can be even on different cpu. So just clear netif_running(), + * and wait when poll really will happen. Actually, the best place + * for this is inside dev->stop() after device stopped its irq + * engine, but this requires more changes in devices. */ + + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { + /* No hurry. */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (dev->stop) + dev->stop(dev); + + /* + * Device is now down. + */ + + dev->flags &= ~IFF_UP; + + /* + * Tell people we are down + */ + notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + + return 0; +} + + +/* + * Device change register/unregister. These are not inline or static + * as we export them to the world. + */ + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + int err; + + rtnl_lock(); + err = notifier_chain_register(&netdev_chain, nb); + if (!err) { + for (dev = dev_base; dev; dev = dev->next) { + nb->notifier_call(nb, NETDEV_REGISTER, dev); + + if (dev->flags & IFF_UP) + nb->notifier_call(nb, NETDEV_UP, dev); + } + } + rtnl_unlock(); + return err; +} + +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&netdev_chain, nb); +} + +/** + * call_netdevice_notifiers - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for notifier_call_chain(). + */ + +int call_netdevice_notifiers(unsigned long val, void *v) +{ + return notifier_call_chain(&netdev_chain, val, v); +} + +/* When > 0 there are consumers of rx skb time stamps */ +static atomic_t netstamp_needed = ATOMIC_INIT(0); + +void net_enable_timestamp(void) +{ + atomic_inc(&netstamp_needed); +} + +void net_disable_timestamp(void) +{ + atomic_dec(&netstamp_needed); +} + +static inline void net_timestamp(struct timeval *stamp) +{ + if (atomic_read(&netstamp_needed)) + do_gettimeofday(stamp); + else { + stamp->tv_sec = 0; + stamp->tv_usec = 0; + } +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. + */ + +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ + struct packet_type *ptype; + net_timestamp(&skb->stamp); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + (ptype->af_packet_priv == NULL || + (struct sock *)ptype->af_packet_priv != skb->sk)) { + struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); + if (!skb2) + break; + + /* skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb2->mac.raw = skb2->data; + + if (skb2->nh.raw < skb2->data || + skb2->nh.raw > skb2->tail) { + if (net_ratelimit()) + printk(KERN_CRIT "protocol %04x is " + "buggy, dev %s\n", + skb2->protocol, dev->name); + skb2->nh.raw = skb2->data; + } + + skb2->h.raw = skb2->nh.raw; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype); + } + } + rcu_read_unlock(); +} + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. + */ +int skb_checksum_help(struct sk_buff *skb, int inward) +{ + unsigned int csum; + int ret = 0, offset = skb->h.raw - skb->data; + + if (inward) { + skb->ip_summed = CHECKSUM_NONE; + goto out; + } + + if (skb_cloned(skb)) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + + if (offset > (int)skb->len) + BUG(); + csum = skb_checksum(skb, offset, skb->len-offset, 0); + + offset = skb->tail - skb->h.raw; + if (offset <= 0) + BUG(); + if (skb->csum + 2 > offset) + BUG(); + + *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} + +#ifdef CONFIG_HIGHMEM +/* Actually, we should eliminate this check as soon as we know, that: + * 1. IOMMU is present and allows to map all the memory. + * 2. No high memory really exists on this machine. + */ + +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ + int i; + + if (dev->features & NETIF_F_HIGHDMA) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + return 1; + + return 0; +} +#else +#define illegal_highdma(dev, skb) (0) +#endif + +extern void skb_release_data(struct sk_buff *); + +/* Keep head the same: replace data */ +int __skb_linearize(struct sk_buff *skb, int gfp_mask) +{ + unsigned int size; + u8 *data; + long offset; + struct skb_shared_info *ninfo; + int headerlen = skb->data - skb->head; + int expand = (skb->tail + skb->data_len) - skb->end; + + if (skb_shared(skb)) + BUG(); + + if (expand <= 0) + expand = 0; + + size = skb->end - skb->head + expand; + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + return -ENOMEM; + + /* Copy entire thing */ + if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) + BUG(); + + /* Set up shinfo */ + ninfo = (struct skb_shared_info*)(data + size); + atomic_set(&ninfo->dataref, 1); + ninfo->tso_size = skb_shinfo(skb)->tso_size; + ninfo->tso_segs = skb_shinfo(skb)->tso_segs; + ninfo->nr_frags = 0; + ninfo->frag_list = NULL; + + /* Offset between the two in bytes */ + offset = data - skb->head; + + /* Free old data. */ + skb_release_data(skb); + + skb->head = data; + skb->end = data + size; + + /* Set up new pointers */ + skb->h.raw += offset; + skb->nh.raw += offset; + skb->mac.raw += offset; + skb->tail += offset; + skb->data += offset; + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; + + skb->tail += skb->data_len; + skb->data_len = 0; + return 0; +} + +#define HARD_TX_LOCK(dev, cpu) { \ + if ((dev->features & NETIF_F_LLTX) == 0) { \ + spin_lock(&dev->xmit_lock); \ + dev->xmit_lock_owner = cpu; \ + } \ +} + +#define HARD_TX_UNLOCK(dev) { \ + if ((dev->features & NETIF_F_LLTX) == 0) { \ + dev->xmit_lock_owner = -1; \ + spin_unlock(&dev->xmit_lock); \ + } \ +} + +/** + * dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. + * + * A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + */ + +int dev_queue_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct Qdisc *q; + int rc = -ENOMEM; + + if (skb_shinfo(skb)->frag_list && + !(dev->features & NETIF_F_FRAGLIST) && + __skb_linearize(skb, GFP_ATOMIC)) + goto out_kfree_skb; + + /* Fragmented skb is linearized if device does not support SG, + * or if at least one of fragments is in highmem and device + * does not support DMA from it. + */ + if (skb_shinfo(skb)->nr_frags && + (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && + __skb_linearize(skb, GFP_ATOMIC)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not support + * checksumming for this protocol, complete checksumming here. + */ + if (skb->ip_summed == CHECKSUM_HW && + (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && + (!(dev->features & NETIF_F_IP_CSUM) || + skb->protocol != htons(ETH_P_IP)))) + if (skb_checksum_help(skb, 0)) + goto out_kfree_skb; + + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. + */ + local_bh_disable(); + + /* Updates of qdisc are serialized by queue_lock. + * The struct Qdisc which is pointed to by qdisc is now a + * rcu structure - it may be accessed without acquiring + * a lock (but the structure may be stale.) The freeing of the + * qdisc will be deferred until it's known that there are no + * more references to it. + * + * If the qdisc has an enqueue function, we still need to + * hold the queue_lock before calling it, since queue_lock + * also serializes access to the device queue. + */ + + q = rcu_dereference(dev->qdisc); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); +#endif + if (q->enqueue) { + /* Grab device queue */ + spin_lock(&dev->queue_lock); + + rc = q->enqueue(skb, q); + + qdisc_run(dev); + + spin_unlock(&dev->queue_lock); + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; + } + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that xmit_lock protection is necessary here. + (f.e. loopback and IP tunnels are clean ignoring statistics + counters.) + However, it is possible, that they rely on protection + made by us here. + + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) + */ + if (dev->flags & IFF_UP) { + int cpu = smp_processor_id(); /* ok because BHs are off */ + + if (dev->xmit_lock_owner != cpu) { + + HARD_TX_LOCK(dev, cpu); + + if (!netif_queue_stopped(dev)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + rc = 0; + if (!dev->hard_start_xmit(skb, dev)) { + HARD_TX_UNLOCK(dev); + goto out; + } + } + HARD_TX_UNLOCK(dev); + if (net_ratelimit()) + printk(KERN_CRIT "Virtual device %s asks to " + "queue packet!\n", dev->name); + } else { + /* Recursion is detected! It is possible, + * unfortunately */ + if (net_ratelimit()) + printk(KERN_CRIT "Dead loop on virtual device " + "%s, fix it urgently!\n", dev->name); + } + } + + rc = -ENETDOWN; + local_bh_enable(); + +out_kfree_skb: + kfree_skb(skb); + return rc; +out: + local_bh_enable(); + return rc; +} + + +/*======================================================================= + Receiver routines + =======================================================================*/ + +int netdev_max_backlog = 300; +int weight_p = 64; /* old backlog weight */ +/* These numbers are selected based on intuition and some + * experimentatiom, if you have more scientific way of doing this + * please go ahead and fix things. + */ +int no_cong_thresh = 10; +int no_cong = 20; +int lo_cong = 100; +int mod_cong = 290; + +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; + + +static void get_sample_stats(int cpu) +{ +#ifdef RAND_LIE + unsigned long rd; + int rq; +#endif + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + int blog = sd->input_pkt_queue.qlen; + int avg_blog = sd->avg_blog; + + avg_blog = (avg_blog >> 1) + (blog >> 1); + + if (avg_blog > mod_cong) { + /* Above moderate congestion levels. */ + sd->cng_level = NET_RX_CN_HIGH; +#ifdef RAND_LIE + rd = net_random(); + rq = rd % netdev_max_backlog; + if (rq < avg_blog) /* unlucky bastard */ + sd->cng_level = NET_RX_DROP; +#endif + } else if (avg_blog > lo_cong) { + sd->cng_level = NET_RX_CN_MOD; +#ifdef RAND_LIE + rd = net_random(); + rq = rd % netdev_max_backlog; + if (rq < avg_blog) /* unlucky bastard */ + sd->cng_level = NET_RX_CN_HIGH; +#endif + } else if (avg_blog > no_cong) + sd->cng_level = NET_RX_CN_LOW; + else /* no congestion */ + sd->cng_level = NET_RX_SUCCESS; + + sd->avg_blog = avg_blog; +} + +#ifdef OFFLINE_SAMPLE +static void sample_queue(unsigned long dummy) +{ +/* 10 ms 0r 1ms -- i don't care -- JHS */ + int next_tick = 1; + int cpu = smp_processor_id(); + + get_sample_stats(cpu); + next_tick += jiffies; + mod_timer(&samp_timer, next_tick); +} +#endif + + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_CN_LOW (low congestion) + * NET_RX_CN_MOD (moderate congestion) + * NET_RX_CN_HIGH (high congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + int this_cpu; + struct softnet_data *queue; + unsigned long flags; + + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) + return NET_RX_DROP; + + if (!skb->stamp.tv_sec) + net_timestamp(&skb->stamp); + + /* + * The code is rearranged so that the path is the most + * short when CPU is congested, but is still operating. + */ + local_irq_save(flags); + this_cpu = smp_processor_id(); + queue = &__get_cpu_var(softnet_data); + + __get_cpu_var(netdev_rx_stat).total++; + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { + if (queue->input_pkt_queue.qlen) { + if (queue->throttle) + goto drop; + +enqueue: + dev_hold(skb->dev); + __skb_queue_tail(&queue->input_pkt_queue, skb); +#ifndef OFFLINE_SAMPLE + get_sample_stats(this_cpu); +#endif + local_irq_restore(flags); + return queue->cng_level; + } + + if (queue->throttle) + queue->throttle = 0; + + netif_rx_schedule(&queue->backlog_dev); + goto enqueue; + } + + if (!queue->throttle) { + queue->throttle = 1; + __get_cpu_var(netdev_rx_stat).throttled++; + } + +drop: + __get_cpu_var(netdev_rx_stat).dropped++; + local_irq_restore(flags); + + kfree_skb(skb); + return NET_RX_DROP; +} + +int netif_rx_ni(struct sk_buff *skb) +{ + int err; + + preempt_disable(); + err = netif_rx(skb); + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); + + return err; +} + +EXPORT_SYMBOL(netif_rx_ni); + +static __inline__ void skb_bond(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + if (dev->master) { + skb->real_dev = skb->dev; + skb->dev = dev->master; + } +} + +static void net_tx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_disable(); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_enable(); + + while (clist) { + struct sk_buff *skb = clist; + clist = clist->next; + + BUG_TRAP(!atomic_read(&skb->users)); + __kfree_skb(skb); + } + } + + if (sd->output_queue) { + struct net_device *head; + + local_irq_disable(); + head = sd->output_queue; + sd->output_queue = NULL; + local_irq_enable(); + + while (head) { + struct net_device *dev = head; + head = head->next_sched; + + smp_mb__before_clear_bit(); + clear_bit(__LINK_STATE_SCHED, &dev->state); + + if (spin_trylock(&dev->queue_lock)) { + qdisc_run(dev); + spin_unlock(&dev->queue_lock); + } else { + netif_schedule(dev); + } + } + } +} + +static __inline__ int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev); +} + +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); +struct net_bridge; +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, + unsigned char *addr); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); + +static __inline__ int handle_bridge(struct sk_buff **pskb, + struct packet_type **pt_prev, int *ret) +{ + struct net_bridge_port *port; + + if ((*pskb)->pkt_type == PACKET_LOOPBACK || + (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) + return 0; + + if (*pt_prev) { + *ret = deliver_skb(*pskb, *pt_prev); + *pt_prev = NULL; + } + + return br_handle_frame_hook(port, pskb); +} +#else +#define handle_bridge(skb, pt_prev, ret) (0) +#endif + +#ifdef CONFIG_NET_CLS_ACT +/* TODO: Maybe we should just force sch_ingress to be compiled in + * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions + * a compare and 2 stores extra right now if we dont have it on + * but have CONFIG_NET_CLS_ACT + * NOTE: This doesnt stop any functionality; if you dont have + * the ingress scheduler, you just cant add policies on ingress. + * + */ +static int ing_filter(struct sk_buff *skb) +{ + struct Qdisc *q; + struct net_device *dev = skb->dev; + int result = TC_ACT_OK; + + if (dev->qdisc_ingress) { + __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); + if (MAX_RED_LOOP < ttl++) { + printk("Redir loop detected Dropping packet (%s->%s)\n", + skb->input_dev?skb->input_dev->name:"??",skb->dev->name); + return TC_ACT_SHOT; + } + + skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); + + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); + if (NULL == skb->input_dev) { + skb->input_dev = skb->dev; + printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); + } + spin_lock(&dev->ingress_lock); + if ((q = dev->qdisc_ingress) != NULL) + result = q->enqueue(skb, q); + spin_unlock(&dev->ingress_lock); + + } + + return result; +} +#endif + +int netif_receive_skb(struct sk_buff *skb) +{ + struct packet_type *ptype, *pt_prev; + int ret = NET_RX_DROP; + unsigned short type; + + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) + return NET_RX_DROP; + + if (!skb->stamp.tv_sec) + net_timestamp(&skb->stamp); + + skb_bond(skb); + + __get_cpu_var(netdev_rx_stat).total++; + + skb->h.raw = skb->nh.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->mac.raw; + + pt_prev = NULL; + + rcu_read_lock(); + +#ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + goto ncls; + } +#endif + + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev); + pt_prev = ptype; + } + } + +#ifdef CONFIG_NET_CLS_ACT + if (pt_prev) { + ret = deliver_skb(skb, pt_prev); + pt_prev = NULL; /* noone else should process this after*/ + } else { + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + } + + ret = ing_filter(skb); + + if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { + kfree_skb(skb); + goto out; + } + + skb->tc_verd = 0; +ncls: +#endif + + handle_diverter(skb); + + if (handle_bridge(&skb, &pt_prev, &ret)) + goto out; + + type = skb->protocol; + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { + if (ptype->type == type && + (!ptype->dev || ptype->dev == skb->dev)) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev); + pt_prev = ptype; + } + } + + if (pt_prev) { + ret = pt_prev->func(skb, skb->dev, pt_prev); + } else { + kfree_skb(skb); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. :-) + */ + ret = NET_RX_DROP; + } + +out: + rcu_read_unlock(); + return ret; +} + +static int process_backlog(struct net_device *backlog_dev, int *budget) +{ + int work = 0; + int quota = min(backlog_dev->quota, *budget); + struct softnet_data *queue = &__get_cpu_var(softnet_data); + unsigned long start_time = jiffies; + + for (;;) { + struct sk_buff *skb; + struct net_device *dev; + + local_irq_disable(); + skb = __skb_dequeue(&queue->input_pkt_queue); + if (!skb) + goto job_done; + local_irq_enable(); + + dev = skb->dev; + + netif_receive_skb(skb); + + dev_put(dev); + + work++; + + if (work >= quota || jiffies - start_time > 1) + break; + + } + + backlog_dev->quota -= work; + *budget -= work; + return -1; + +job_done: + backlog_dev->quota -= work; + *budget -= work; + + list_del(&backlog_dev->poll_list); + smp_mb__before_clear_bit(); + netif_poll_enable(backlog_dev); + + if (queue->throttle) + queue->throttle = 0; + local_irq_enable(); + return 0; +} + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *queue = &__get_cpu_var(softnet_data); + unsigned long start_time = jiffies; + int budget = netdev_max_backlog; + + + local_irq_disable(); + + while (!list_empty(&queue->poll_list)) { + struct net_device *dev; + + if (budget <= 0 || jiffies - start_time > 1) + goto softnet_break; + + local_irq_enable(); + + dev = list_entry(queue->poll_list.next, + struct net_device, poll_list); + netpoll_poll_lock(dev); + + if (dev->quota <= 0 || dev->poll(dev, &budget)) { + netpoll_poll_unlock(dev); + local_irq_disable(); + list_del(&dev->poll_list); + list_add_tail(&dev->poll_list, &queue->poll_list); + if (dev->quota < 0) + dev->quota += dev->weight; + else + dev->quota = dev->weight; + } else { + netpoll_poll_unlock(dev); + dev_put(dev); + local_irq_disable(); + } + } +out: + local_irq_enable(); + return; + +softnet_break: + __get_cpu_var(netdev_rx_stat).time_squeeze++; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + goto out; +} + +static gifconf_func_t * gifconf_list [NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct ifreq __user *arg) +{ + struct net_device *dev; + struct ifreq ifr; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifr.ifr_ifindex); + if (!dev) { + read_unlock(&dev_base_lock); + return -ENODEV; + } + + strcpy(ifr.ifr_name, dev->name); + read_unlock(&dev_base_lock); + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for (dev = dev_base; dev; dev = dev->next) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +#ifdef CONFIG_PROC_FS +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ +static __inline__ struct net_device *dev_get_idx(loff_t pos) +{ + struct net_device *dev; + loff_t i; + + for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); + + return i == pos ? dev : NULL; +} + +void *dev_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; +} + +void dev_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&dev_base_lock); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); + } else + seq_printf(seq, "%6s: No statistics available.\n", dev->name); +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct netif_rx_stats *softnet_get_online(loff_t *pos) +{ + struct netif_rx_stats *rc = NULL; + + while (*pos < NR_CPUS) + if (cpu_online(*pos)) { + rc = &per_cpu(netdev_rx_stat, *pos); + break; + } else + ++*pos; + return rc; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct netif_rx_stats *s = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + s->total, s->dropped, s->time_squeeze, s->throttled, + s->fastroute_hit, s->fastroute_success, s->fastroute_defer, + s->fastroute_deferred_out, +#if 0 + s->fastroute_latency_reduction +#else + s->cpu_collision +#endif + ); + return 0; +} + +static struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dev_seq_ops); +} + +static struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef WIRELESS_EXT +extern int wireless_proc_init(void); +#else +#define wireless_proc_init() 0 +#endif + +static int __init dev_proc_init(void) +{ + int rc = -ENOMEM; + + if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; + if (wireless_proc_init()) + goto out_softnet; + rc = 0; +out: + return rc; +out_softnet: + proc_net_remove("softnet_stat"); +out_dev: + proc_net_remove("dev"); + goto out; +} +#else +#define dev_proc_init() 0 +#endif /* CONFIG_PROC_FS */ + + +/** + * netdev_set_master - set up master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success the reference counts + * are adjusted, %RTM_NEWLINK is sent to the routing socket and the + * function returns zero. + */ +int netdev_set_master(struct net_device *slave, struct net_device *master) +{ + struct net_device *old = slave->master; + + ASSERT_RTNL(); + + if (master) { + if (old) + return -EBUSY; + dev_hold(master); + } + + slave->master = master; + + synchronize_net(); + + if (old) + dev_put(old); + + if (master) + slave->flags |= IFF_SLAVE; + else + slave->flags &= ~IFF_SLAVE; + + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); + return 0; +} + +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promsicuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + */ +void dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_PROMISC; + if ((dev->promiscuity += inc) == 0) + dev->flags &= ~IFF_PROMISC; + if (dev->flags ^ old_flags) { + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags & IFF_PROMISC) ? "entered" : + "left"); + } +} + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + */ + +void dev_set_allmulti(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_ALLMULTI; + if ((dev->allmulti += inc) == 0) + dev->flags &= ~IFF_ALLMULTI; + if (dev->flags ^ old_flags) + dev_mc_upload(dev); +} + +unsigned dev_get_flags(const struct net_device *dev) +{ + unsigned flags; + + flags = (dev->flags & ~(IFF_PROMISC | + IFF_ALLMULTI | + IFF_RUNNING)) | + (dev->gflags & (IFF_PROMISC | + IFF_ALLMULTI)); + + if (netif_running(dev) && netif_carrier_ok(dev)) + flags |= IFF_RUNNING; + + return flags; +} + +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret; + int old_flags = dev->flags; + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | + IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + dev_mc_upload(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + + if (!ret) + dev_mc_upload(dev); + } + + if (dev->flags & IFF_UP && + ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | + IFF_VOLATILE))) + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + + if ((flags ^ dev->gflags) & IFF_PROMISC) { + int inc = (flags & IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags ^ dev->gflags) & IFF_ALLMULTI) { + int inc = (flags & IFF_ALLMULTI) ? +1 : -1; + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + if (old_flags ^ dev->flags) + rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); + + return ret; +} + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + int err; + + if (new_mtu == dev->mtu) + return 0; + + /* MTU must be positive. */ + if (new_mtu < 0) + return -EINVAL; + + if (!netif_device_present(dev)) + return -ENODEV; + + err = 0; + if (dev->change_mtu) + err = dev->change_mtu(dev, new_mtu); + else + dev->mtu = new_mtu; + if (!err && dev->flags & IFF_UP) + notifier_call_chain(&netdev_chain, + NETDEV_CHANGEMTU, dev); + return err; +} + +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + int err; + + if (!dev->set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = dev->set_mac_address(dev, sa); + if (!err) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return err; +} + +/* + * Perform the SIOCxIFxxx calls. + */ +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = dev_get_flags(dev); + return 0; + + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + notifier_call_chain(&netdev_chain, + NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCSIFMAP: + if (dev->set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return dev->set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if (!dev->set_multicast_list || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, + dev->addr_len, 1); + + case SIOCDELMULTI: + if (!dev->set_multicast_list || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, + dev->addr_len, 1); + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + /* + * Unknown or private ioctl + */ + + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (dev->do_ioctl) { + if (netif_device_present(dev)) + err = dev->do_ioctl(dev, ifr, + cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_shlock(); + ret = dev_ifconf((char __user *) arg); + rtnl_shunlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname((struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(ifr.ifr_name); + read_lock(&dev_base_lock); + ret = dev_ifsioc(&ifr, cmd); + read_unlock(&dev_base_lock); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(&ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -EINVAL; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } +#ifdef WIRELESS_EXT + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + /* If command is `set a parameter', or + * `get the encoding parameters', check if + * the user has the right to do it */ + if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + } + dev_load(ifr.ifr_name); + rtnl_lock(); + /* Follow me in net/core/wireless.c */ + ret = wireless_process_ioctl(&ifr, cmd); + rtnl_unlock(); + if (IW_IS_GET(cmd) && + copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } +#endif /* WIRELESS_EXT */ + return -EINVAL; + } +} + + +/** + * dev_new_index - allocate an ifindex + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(void) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(ifindex)) + return ifindex; + } +} + +static int dev_boot_phase = 1; + +/* Delayed registration/unregisteration */ +static DEFINE_SPINLOCK(net_todo_list_lock); +static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); + +static inline void net_set_todo(struct net_device *dev) +{ + spin_lock(&net_todo_list_lock); + list_add_tail(&dev->todo_list, &net_todo_list); + spin_unlock(&net_todo_list_lock); +} + +/** + * register_netdevice - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * Callers must hold the rtnl semaphore. You may want + * register_netdev() instead of this. + * + * BUGS: + * The locking appears insufficient to guarantee two parallel registers + * will not get the same name. + */ + +int register_netdevice(struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *p; + int ret; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->xmit_lock); + dev->xmit_lock_owner = -1; +#ifdef CONFIG_NET_CLS_ACT + spin_lock_init(&dev->ingress_lock); +#endif + + ret = alloc_divert_blk(dev); + if (ret) + goto out; + + dev->iflink = -1; + + /* Init, if this function is available */ + if (dev->init) { + ret = dev->init(dev); + if (ret) { + if (ret > 0) + ret = -EIO; + goto out_err; + } + } + + if (!dev_valid_name(dev->name)) { + ret = -EINVAL; + goto out_err; + } + + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Check for existence of name */ + head = dev_name_hash(dev->name); + hlist_for_each(p, head) { + struct net_device *d + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(d->name, dev->name, IFNAMSIZ)) { + ret = -EEXIST; + goto out_err; + } + } + + /* Fix illegal SG+CSUM combinations. */ + if ((dev->features & NETIF_F_SG) && + !(dev->features & (NETIF_F_IP_CSUM | + NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM))) { + printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", + dev->name); + dev->features &= ~NETIF_F_SG; + } + + /* TSO requires that SG is present as well. */ + if ((dev->features & NETIF_F_TSO) && + !(dev->features & NETIF_F_SG)) { + printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", + dev->name); + dev->features &= ~NETIF_F_TSO; + } + + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. + */ + + if (!dev->rebuild_header) + dev->rebuild_header = default_rebuild_header; + + /* + * Default initial state at registry is that the + * device is present. + */ + + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev->next = NULL; + dev_init_scheduler(dev); + write_lock_bh(&dev_base_lock); + *dev_tail = dev; + dev_tail = &dev->next; + hlist_add_head(&dev->name_hlist, head); + hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); + dev_hold(dev); + dev->reg_state = NETREG_REGISTERING; + write_unlock_bh(&dev_base_lock); + + /* Notify protocols, that a new device appeared. */ + notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + + /* Finish registration after unlock */ + net_set_todo(dev); + ret = 0; + +out: + return ret; +out_err: + free_divert_blk(dev); + goto out; +} + +/** + * register_netdev - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * This is a wrapper around register_netdev that takes the rtnl semaphore + * and expands the device name if you passed a format string to + * alloc_netdev. + */ +int register_netdev(struct net_device *dev) +{ + int err; + + rtnl_lock(); + + /* + * If the name is a format string the caller wants us to do a + * name allocation. + */ + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto out; + } + + /* + * Back compatibility hook. Kill this one in 2.5 + */ + if (dev->name[0] == 0 || dev->name[0] == ' ') { + err = dev_alloc_name(dev, "eth%d"); + if (err < 0) + goto out; + } + + err = register_netdevice(dev); +out: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdev); + +/* + * netdev_wait_allrefs - wait until all references are gone. + * + * This is called when unregistering network devices. + * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put. + */ +static void netdev_wait_allrefs(struct net_device *dev) +{ + unsigned long rebroadcast_time, warning_time; + + rebroadcast_time = warning_time = jiffies; + while (atomic_read(&dev->refcnt) != 0) { + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { + rtnl_shlock(); + + /* Rebroadcast unregister notification */ + notifier_call_chain(&netdev_chain, + NETDEV_UNREGISTER, dev); + + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + } + + rtnl_shunlock(); + + rebroadcast_time = jiffies; + } + + msleep(250); + + if (time_after(jiffies, warning_time + 10 * HZ)) { + printk(KERN_EMERG "unregister_netdevice: " + "waiting for %s to become free. Usage " + "count = %d\n", + dev->name, atomic_read(&dev->refcnt)); + warning_time = jiffies; + } + } +} + +/* The sequence is: + * + * rtnl_lock(); + * ... + * register_netdevice(x1); + * register_netdevice(x2); + * ... + * unregister_netdevice(y1); + * unregister_netdevice(y2); + * ... + * rtnl_unlock(); + * free_netdev(y1); + * free_netdev(y2); + * + * We are invoked by rtnl_unlock() after it drops the semaphore. + * This allows us to deal with problems: + * 1) We can create/delete sysfs objects which invoke hotplug + * without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + * safely in order to wait for the netdev refcnt to drop to zero. + */ +static DECLARE_MUTEX(net_todo_run_mutex); +void netdev_run_todo(void) +{ + struct list_head list = LIST_HEAD_INIT(list); + int err; + + + /* Need to guard against multiple cpu's getting out of order. */ + down(&net_todo_run_mutex); + + /* Not safe to do outside the semaphore. We must not return + * until all unregister events invoked by the local processor + * have been completed (either by this todo run, or one on + * another cpu). + */ + if (list_empty(&net_todo_list)) + goto out; + + /* Snapshot list, allow later requests */ + spin_lock(&net_todo_list_lock); + list_splice_init(&net_todo_list, &list); + spin_unlock(&net_todo_list_lock); + + while (!list_empty(&list)) { + struct net_device *dev + = list_entry(list.next, struct net_device, todo_list); + list_del(&dev->todo_list); + + switch(dev->reg_state) { + case NETREG_REGISTERING: + err = netdev_register_sysfs(dev); + if (err) + printk(KERN_ERR "%s: failed sysfs registration (%d)\n", + dev->name, err); + dev->reg_state = NETREG_REGISTERED; + break; + + case NETREG_UNREGISTERING: + netdev_unregister_sysfs(dev); + dev->reg_state = NETREG_UNREGISTERED; + + netdev_wait_allrefs(dev); + + /* paranoia */ + BUG_ON(atomic_read(&dev->refcnt)); + BUG_TRAP(!dev->ip_ptr); + BUG_TRAP(!dev->ip6_ptr); + BUG_TRAP(!dev->dn_ptr); + + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. + */ + if (dev->destructor) + dev->destructor(dev); + break; + + default: + printk(KERN_ERR "network todo '%s' but state %d\n", + dev->name, dev->reg_state); + break; + } + } + +out: + up(&net_todo_run_mutex); +} + +/** + * alloc_netdev - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @setup: callback to initialize device + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. + */ +struct net_device *alloc_netdev(int sizeof_priv, const char *name, + void (*setup)(struct net_device *)) +{ + void *p; + struct net_device *dev; + int alloc_size; + + /* ensure 32-byte alignment of both the device and private area */ + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; + + p = kmalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); + return NULL; + } + memset(p, 0, alloc_size); + + dev = (struct net_device *) + (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + dev->padded = (char *)dev - (char *)p; + + if (sizeof_priv) + dev->priv = netdev_priv(dev); + + setup(dev); + strcpy(dev->name, name); + return dev; +} +EXPORT_SYMBOL(alloc_netdev); + +/** + * free_netdev - free network device + * @dev: device + * + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. + * If this is the last reference then it will be freed. + */ +void free_netdev(struct net_device *dev) +{ +#ifdef CONFIG_SYSFS + /* Compatiablity with error handling in drivers */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + kfree((char *)dev - dev->padded); + return; + } + + BUG_ON(dev->reg_state != NETREG_UNREGISTERED); + dev->reg_state = NETREG_RELEASED; + + /* will free via class release */ + class_device_put(&dev->class_dev); +#else + kfree((char *)dev - dev->padded); +#endif +} + +/* Synchronize with packet receive processing. */ +void synchronize_net(void) +{ + might_sleep(); + synchronize_kernel(); +} + +/** + * unregister_netdevice - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. On success 0 is returned, on a failure + * a negative errno code is returned. + * + * Callers must hold the rtnl semaphore. You may want + * unregister_netdev() instead of this. + */ + +int unregister_netdevice(struct net_device *dev) +{ + struct net_device *d, **dp; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + /* Some devices call without registering for initialization unwind. */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); + return -ENODEV; + } + + BUG_ON(dev->reg_state != NETREG_REGISTERED); + + /* If device is running, close it first. */ + if (dev->flags & IFF_UP) + dev_close(dev); + + /* And unlink it from device chain. */ + for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { + if (d == dev) { + write_lock_bh(&dev_base_lock); + hlist_del(&dev->name_hlist); + hlist_del(&dev->index_hlist); + if (dev_tail == &dev->next) + dev_tail = dp; + *dp = d->next; + write_unlock_bh(&dev_base_lock); + break; + } + } + if (!d) { + printk(KERN_ERR "unregister net_device: '%s' not found\n", + dev->name); + return -ENODEV; + } + + dev->reg_state = NETREG_UNREGISTERING; + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + if (dev->uninit) + dev->uninit(dev); + + /* Notifier chain MUST detach us from master device. */ + BUG_TRAP(!dev->master); + + free_divert_blk(dev); + + /* Finish processing unregister after unlock */ + net_set_todo(dev); + + synchronize_net(); + + dev_put(dev); + return 0; +} + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. On success 0 is returned, on a failure + * a negative errno code is returned. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. + */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} + +EXPORT_SYMBOL(unregister_netdev); + +#ifdef CONFIG_HOTPLUG_CPU +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct net_device **list_net; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_DEAD) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Find end of our output_queue. */ + list_net = &sd->output_queue; + while (*list_net) + list_net = &(*list_net)->next_sched; + /* Append output queue from offline CPU. */ + *list_net = oldsd->output_queue; + oldsd->output_queue = NULL; + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + netif_rx(skb); + + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ + +/* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. + */ +static int __init net_dev_init(void) +{ + int i, rc = -ENOMEM; + + BUG_ON(!dev_boot_phase); + + net_random_init(); + + if (dev_proc_init()) + goto out; + + if (netdev_sysfs_init()) + goto out; + + INIT_LIST_HEAD(&ptype_all); + for (i = 0; i < 16; i++) + INIT_LIST_HEAD(&ptype_base[i]); + + for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) + INIT_HLIST_HEAD(&dev_name_head[i]); + + for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) + INIT_HLIST_HEAD(&dev_index_head[i]); + + /* + * Initialise the packet receive queues. + */ + + for (i = 0; i < NR_CPUS; i++) { + struct softnet_data *queue; + + queue = &per_cpu(softnet_data, i); + skb_queue_head_init(&queue->input_pkt_queue); + queue->throttle = 0; + queue->cng_level = 0; + queue->avg_blog = 10; /* arbitrary non-zero */ + queue->completion_queue = NULL; + INIT_LIST_HEAD(&queue->poll_list); + set_bit(__LINK_STATE_START, &queue->backlog_dev.state); + queue->backlog_dev.weight = weight_p; + queue->backlog_dev.poll = process_backlog; + atomic_set(&queue->backlog_dev.refcnt, 1); + } + +#ifdef OFFLINE_SAMPLE + samp_timer.expires = jiffies + (10 * HZ); + add_timer(&samp_timer); +#endif + + dev_boot_phase = 0; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); + open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); + + hotcpu_notifier(dev_cpu_callback, 0); + dst_init(); + dev_mcast_init(); + rc = 0; +out: + return rc; +} + +subsys_initcall(net_dev_init); + +EXPORT_SYMBOL(__dev_get_by_index); +EXPORT_SYMBOL(__dev_get_by_name); +EXPORT_SYMBOL(__dev_remove_pack); +EXPORT_SYMBOL(__skb_linearize); +EXPORT_SYMBOL(dev_add_pack); +EXPORT_SYMBOL(dev_alloc_name); +EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(dev_get_by_flags); +EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(dev_ioctl); +EXPORT_SYMBOL(dev_open); +EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(dev_remove_pack); +EXPORT_SYMBOL(dev_set_allmulti); +EXPORT_SYMBOL(dev_set_promiscuity); +EXPORT_SYMBOL(dev_change_flags); +EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(dev_set_mac_address); +EXPORT_SYMBOL(free_netdev); +EXPORT_SYMBOL(netdev_boot_setup_check); +EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_state_change); +EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_rx); +EXPORT_SYMBOL(register_gifconf); +EXPORT_SYMBOL(register_netdevice); +EXPORT_SYMBOL(register_netdevice_notifier); +EXPORT_SYMBOL(skb_checksum_help); +EXPORT_SYMBOL(synchronize_net); +EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_notifier); +EXPORT_SYMBOL(net_enable_timestamp); +EXPORT_SYMBOL(net_disable_timestamp); +EXPORT_SYMBOL(dev_get_flags); + +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_fdb_get_hook); +EXPORT_SYMBOL(br_fdb_put_hook); +#endif + +#ifdef CONFIG_KMOD +EXPORT_SYMBOL(dev_load); +#endif + +EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c new file mode 100644 index 000000000000..db098ff3cd6a --- /dev/null +++ b/net/core/dev_mcast.c @@ -0,0 +1,299 @@ +/* + * Linux NET3: Multicast List maintenance. + * + * Authors: + * Tim Kordas <tjk@nostromo.eeap.cwru.edu> + * Richard Underwood <richard@wuzz.demon.co.uk> + * + * Stir fried together from the IP multicast and CAP patches above + * Alan Cox <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : Update the device on a real delete + * rather than any time but... + * Alan Cox : IFF_ALLMULTI support. + * Alan Cox : New format set_multicast_list() calls. + * Gleb Natapov : Remove dev_mc_lock. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> + + +/* + * Device multicast list maintenance. + * + * This is used both by IP and by the user level maintenance functions. + * Unlike BSD we maintain a usage count on a given multicast address so + * that a casual user application can add/delete multicasts used by + * protocols without doing damage to the protocols when it deletes the + * entries. It also helps IP as it tracks overlapping maps. + * + * Device mc lists are changed by bh at least if IPv6 is enabled, + * so that it must be bh protected. + * + * We block accesses to device mc filters with dev->xmit_lock. + */ + +/* + * Update the multicast list into the physical NIC controller. + */ + +static void __dev_mc_upload(struct net_device *dev) +{ + /* Don't do anything till we up the interface + * [dev_open will call this function so the list will + * stay sane] + */ + + if (!(dev->flags&IFF_UP)) + return; + + /* + * Devices with no set multicast or which have been + * detached don't get set. + */ + + if (dev->set_multicast_list == NULL || + !netif_device_present(dev)) + return; + + dev->set_multicast_list(dev); +} + +void dev_mc_upload(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + __dev_mc_upload(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +/* + * Delete a device level multicast + */ + +int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, **dmip; + + spin_lock_bh(&dev->xmit_lock); + + for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { + /* + * Find the entry we want to delete. The device could + * have variable length entries so check these too. + */ + if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && + alen == dmi->dmi_addrlen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 0; + if (old_glbl == 0) + break; + } + if (--dmi->dmi_users) + goto done; + + /* + * Last user. So delete the entry. + */ + *dmip = dmi->next; + dev->mc_count--; + + kfree(dmi); + + /* + * We have altered the list, so the card + * loaded filter is now wrong. Fix it + */ + __dev_mc_upload(dev); + + spin_unlock_bh(&dev->xmit_lock); + return 0; + } + } + err = -ENOENT; +done: + spin_unlock_bh(&dev->xmit_lock); + return err; +} + +/* + * Add a device level multicast + */ + +int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, *dmi1; + + dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC); + + spin_lock_bh(&dev->xmit_lock); + for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { + if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && + dmi->dmi_addrlen == alen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 1; + if (old_glbl) + goto done; + } + dmi->dmi_users++; + goto done; + } + } + + if ((dmi = dmi1) == NULL) { + spin_unlock_bh(&dev->xmit_lock); + return -ENOMEM; + } + memcpy(dmi->dmi_addr, addr, alen); + dmi->dmi_addrlen = alen; + dmi->next = dev->mc_list; + dmi->dmi_users = 1; + dmi->dmi_gusers = glbl ? 1 : 0; + dev->mc_list = dmi; + dev->mc_count++; + + __dev_mc_upload(dev); + + spin_unlock_bh(&dev->xmit_lock); + return 0; + +done: + spin_unlock_bh(&dev->xmit_lock); + if (dmi1) + kfree(dmi1); + return err; +} + +/* + * Discard multicast list when a device is downed + */ + +void dev_mc_discard(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + + while (dev->mc_list != NULL) { + struct dev_mc_list *tmp = dev->mc_list; + dev->mc_list = tmp->next; + if (tmp->dmi_users > tmp->dmi_gusers) + printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); + kfree(tmp); + } + dev->mc_count = 0; + + spin_unlock_bh(&dev->xmit_lock); +} + +#ifdef CONFIG_PROC_FS +static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + loff_t off = 0; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { + if (off++ == *pos) + return dev; + } + return NULL; +} + +static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev = v; + ++*pos; + return dev->next; +} + +static void dev_mc_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&dev_base_lock); +} + + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct dev_mc_list *m; + struct net_device *dev = v; + + spin_lock_bh(&dev->xmit_lock); + for (m = dev->mc_list; m; m = m->next) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, m->dmi_users, m->dmi_gusers); + + for (i = 0; i < m->dmi_addrlen; i++) + seq_printf(seq, "%02x", m->dmi_addr[i]); + + seq_putc(seq, '\n'); + } + spin_unlock_bh(&dev->xmit_lock); + return 0; +} + +static struct seq_operations dev_mc_seq_ops = { + .start = dev_mc_seq_start, + .next = dev_mc_seq_next, + .stop = dev_mc_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dev_mc_seq_ops); +} + +static struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +void __init dev_mcast_init(void) +{ + proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); +} + +EXPORT_SYMBOL(dev_mc_add); +EXPORT_SYMBOL(dev_mc_delete); +EXPORT_SYMBOL(dev_mc_upload); diff --git a/net/core/dst.c b/net/core/dst.c new file mode 100644 index 000000000000..3bf6cc434814 --- /dev/null +++ b/net/core/dst.c @@ -0,0 +1,276 @@ +/* + * net/core/dst.c Protocol independent destination cache. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <net/dst.h> + +/* Locking strategy: + * 1) Garbage collection state of dead destination cache + * entries is protected by dst_lock. + * 2) GC is run only from BH context, and is the only remover + * of entries. + * 3) Entries are added to the garbage list from both BH + * and non-BH context, so local BH disabling is needed. + * 4) All operations modify state, so a spinlock is used. + */ +static struct dst_entry *dst_garbage_list; +#if RT_CACHE_DEBUG >= 2 +static atomic_t dst_total = ATOMIC_INIT(0); +#endif +static DEFINE_SPINLOCK(dst_lock); + +static unsigned long dst_gc_timer_expires; +static unsigned long dst_gc_timer_inc = DST_GC_MAX; +static void dst_run_gc(unsigned long); +static void ___dst_free(struct dst_entry * dst); + +static struct timer_list dst_gc_timer = + TIMER_INITIALIZER(dst_run_gc, DST_GC_MIN, 0); + +static void dst_run_gc(unsigned long dummy) +{ + int delayed = 0; + struct dst_entry * dst, **dstp; + + if (!spin_trylock(&dst_lock)) { + mod_timer(&dst_gc_timer, jiffies + HZ/10); + return; + } + + + del_timer(&dst_gc_timer); + dstp = &dst_garbage_list; + while ((dst = *dstp) != NULL) { + if (atomic_read(&dst->__refcnt)) { + dstp = &dst->next; + delayed++; + continue; + } + *dstp = dst->next; + + dst = dst_destroy(dst); + if (dst) { + /* NOHASH and still referenced. Unless it is already + * on gc list, invalidate it and add to gc list. + * + * Note: this is temporary. Actually, NOHASH dst's + * must be obsoleted when parent is obsoleted. + * But we do not have state "obsoleted, but + * referenced by parent", so it is right. + */ + if (dst->obsolete > 1) + continue; + + ___dst_free(dst); + dst->next = *dstp; + *dstp = dst; + dstp = &dst->next; + } + } + if (!dst_garbage_list) { + dst_gc_timer_inc = DST_GC_MAX; + goto out; + } + if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) + dst_gc_timer_expires = DST_GC_MAX; + dst_gc_timer_inc += DST_GC_INC; + dst_gc_timer.expires = jiffies + dst_gc_timer_expires; +#if RT_CACHE_DEBUG >= 2 + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); +#endif + add_timer(&dst_gc_timer); + +out: + spin_unlock(&dst_lock); +} + +static int dst_discard_in(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int dst_discard_out(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +void * dst_alloc(struct dst_ops * ops) +{ + struct dst_entry * dst; + + if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc()) + return NULL; + } + dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC); + if (!dst) + return NULL; + memset(dst, 0, ops->entry_size); + atomic_set(&dst->__refcnt, 0); + dst->ops = ops; + dst->lastuse = jiffies; + dst->path = dst; + dst->input = dst_discard_in; + dst->output = dst_discard_out; +#if RT_CACHE_DEBUG >= 2 + atomic_inc(&dst_total); +#endif + atomic_inc(&ops->entries); + return dst; +} + +static void ___dst_free(struct dst_entry * dst) +{ + /* The first case (dev==NULL) is required, when + protocol module is unloaded. + */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard_in; + dst->output = dst_discard_out; + } + dst->obsolete = 2; +} + +void __dst_free(struct dst_entry * dst) +{ + spin_lock_bh(&dst_lock); + ___dst_free(dst); + dst->next = dst_garbage_list; + dst_garbage_list = dst; + if (dst_gc_timer_inc > DST_GC_INC) { + dst_gc_timer_inc = DST_GC_INC; + dst_gc_timer_expires = DST_GC_MIN; + mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); + } + spin_unlock_bh(&dst_lock); +} + +struct dst_entry *dst_destroy(struct dst_entry * dst) +{ + struct dst_entry *child; + struct neighbour *neigh; + struct hh_cache *hh; + + smp_rmb(); + +again: + neigh = dst->neighbour; + hh = dst->hh; + child = dst->child; + + dst->hh = NULL; + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + + if (neigh) { + dst->neighbour = NULL; + neigh_release(neigh); + } + + atomic_dec(&dst->ops->entries); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + if (dst->dev) + dev_put(dst->dev); +#if RT_CACHE_DEBUG >= 2 + atomic_dec(&dst_total); +#endif + kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) { + if (atomic_dec_and_test(&dst->__refcnt)) { + /* We were real parent of this dst, so kill child. */ + if (dst->flags&DST_NOHASH) + goto again; + } else { + /* Child is still referenced, return it for freeing. */ + if (dst->flags&DST_NOHASH) + return dst; + /* Child is still in his hash table */ + } + } + return NULL; +} + +/* Dirty hack. We did it in 2.2 (in __dst_free), + * we have _very_ good reasons not to repeat + * this mistake in 2.3, but we have no choice + * now. _It_ _is_ _explicit_ _deliberate_ + * _race_ _condition_. + * + * Commented and originally written by Alexey. + */ +static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, unregister); + + if (dev != dst->dev) + return; + + if (!unregister) { + dst->input = dst_discard_in; + dst->output = dst_discard_out; + } else { + dst->dev = &loopback_dev; + dev_hold(&loopback_dev); + dev_put(dev); + if (dst->neighbour && dst->neighbour->dev == dev) { + dst->neighbour->dev = &loopback_dev; + dev_put(dev); + dev_hold(&loopback_dev); + } + } +} + +static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dst_entry *dst; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + spin_lock_bh(&dst_lock); + for (dst = dst_garbage_list; dst; dst = dst->next) { + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } + spin_unlock_bh(&dst_lock); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block dst_dev_notifier = { + .notifier_call = dst_dev_event, +}; + +void __init dst_init(void) +{ + register_netdevice_notifier(&dst_dev_notifier); +} + +EXPORT_SYMBOL(__dst_free); +EXPORT_SYMBOL(dst_alloc); +EXPORT_SYMBOL(dst_destroy); diff --git a/net/core/dv.c b/net/core/dv.c new file mode 100644 index 000000000000..3f25f4aa4e66 --- /dev/null +++ b/net/core/dv.c @@ -0,0 +1,548 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic frame diversion + * + * Authors: + * Benoit LOCHER: initial integration within the kernel with support for ethernet + * Dave Miller: improvement on the code (correctness, performance and source files) + * + */ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <net/dst.h> +#include <net/arp.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/ip.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/checksum.h> +#include <linux/divert.h> +#include <linux/sockios.h> + +const char sysctl_divert_version[32]="0.46"; /* Current version */ + +static int __init dv_init(void) +{ + return 0; +} +module_init(dv_init); + +/* + * Allocate a divert_blk for a device. This must be an ethernet nic. + */ +int alloc_divert_blk(struct net_device *dev) +{ + int alloc_size = (sizeof(struct divert_blk) + 3) & ~3; + + dev->divert = NULL; + if (dev->type == ARPHRD_ETHER) { + dev->divert = (struct divert_blk *) + kmalloc(alloc_size, GFP_KERNEL); + if (dev->divert == NULL) { + printk(KERN_INFO "divert: unable to allocate divert_blk for %s\n", + dev->name); + return -ENOMEM; + } + + memset(dev->divert, 0, sizeof(struct divert_blk)); + dev_hold(dev); + } + + return 0; +} + +/* + * Free a divert_blk allocated by the above function, if it was + * allocated on that device. + */ +void free_divert_blk(struct net_device *dev) +{ + if (dev->divert) { + kfree(dev->divert); + dev->divert=NULL; + dev_put(dev); + } +} + +/* + * Adds a tcp/udp (source or dest) port to an array + */ +static int add_port(u16 ports[], u16 port) +{ + int i; + + if (port == 0) + return -EINVAL; + + /* Storing directly in network format for performance, + * thanks Dave :) + */ + port = htons(port); + + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + if (ports[i] == port) + return -EALREADY; + } + + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + if (ports[i] == 0) { + ports[i] = port; + return 0; + } + } + + return -ENOBUFS; +} + +/* + * Removes a port from an array tcp/udp (source or dest) + */ +static int remove_port(u16 ports[], u16 port) +{ + int i; + + if (port == 0) + return -EINVAL; + + /* Storing directly in network format for performance, + * thanks Dave ! + */ + port = htons(port); + + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + if (ports[i] == port) { + ports[i] = 0; + return 0; + } + } + + return -EINVAL; +} + +/* Some basic sanity checks on the arguments passed to divert_ioctl() */ +static int check_args(struct divert_cf *div_cf, struct net_device **dev) +{ + char devname[32]; + int ret; + + if (dev == NULL) + return -EFAULT; + + /* GETVERSION: all other args are unused */ + if (div_cf->cmd == DIVCMD_GETVERSION) + return 0; + + /* Network device index should reasonably be between 0 and 1000 :) */ + if (div_cf->dev_index < 0 || div_cf->dev_index > 1000) + return -EINVAL; + + /* Let's try to find the ifname */ + sprintf(devname, "eth%d", div_cf->dev_index); + *dev = dev_get_by_name(devname); + + /* dev should NOT be null */ + if (*dev == NULL) + return -EINVAL; + + ret = 0; + + /* user issuing the ioctl must be a super one :) */ + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + + /* Device must have a divert_blk member NOT null */ + if ((*dev)->divert == NULL) + ret = -EINVAL; +out: + dev_put(*dev); + return ret; +} + +/* + * control function of the diverter + */ +#if 0 +#define DVDBG(a) \ + printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a)) +#else +#define DVDBG(a) +#endif + +int divert_ioctl(unsigned int cmd, struct divert_cf __user *arg) +{ + struct divert_cf div_cf; + struct divert_blk *div_blk; + struct net_device *dev; + int ret; + + switch (cmd) { + case SIOCGIFDIVERT: + DVDBG("SIOCGIFDIVERT, copy_from_user"); + if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf))) + return -EFAULT; + DVDBG("before check_args"); + ret = check_args(&div_cf, &dev); + if (ret) + return ret; + DVDBG("after checkargs"); + div_blk = dev->divert; + + DVDBG("befre switch()"); + switch (div_cf.cmd) { + case DIVCMD_GETSTATUS: + /* Now, just give the user the raw divert block + * for him to play with :) + */ + if (copy_to_user(div_cf.arg1.ptr, dev->divert, + sizeof(struct divert_blk))) + return -EFAULT; + break; + + case DIVCMD_GETVERSION: + DVDBG("GETVERSION: checking ptr"); + if (div_cf.arg1.ptr == NULL) + return -EINVAL; + DVDBG("GETVERSION: copying data to userland"); + if (copy_to_user(div_cf.arg1.ptr, + sysctl_divert_version, 32)) + return -EFAULT; + DVDBG("GETVERSION: data copied"); + break; + + default: + return -EINVAL; + } + + break; + + case SIOCSIFDIVERT: + if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf))) + return -EFAULT; + + ret = check_args(&div_cf, &dev); + if (ret) + return ret; + + div_blk = dev->divert; + + switch(div_cf.cmd) { + case DIVCMD_RESET: + div_blk->divert = 0; + div_blk->protos = DIVERT_PROTO_NONE; + memset(div_blk->tcp_dst, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + memset(div_blk->tcp_src, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + memset(div_blk->udp_dst, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + memset(div_blk->udp_src, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + return 0; + + case DIVCMD_DIVERT: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->divert) + return -EALREADY; + div_blk->divert = 1; + break; + + case DIVARG1_DISABLE: + if (!div_blk->divert) + return -EALREADY; + div_blk->divert = 0; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_IP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_IP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_IP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_IP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_IP; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_TCP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_TCP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_TCP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_TCP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_TCP; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_TCPDST: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->tcp_dst, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->tcp_dst, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_TCPSRC: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->tcp_src, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->tcp_src, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_UDP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_UDP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_UDP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_UDP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_UDP; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_UDPDST: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->udp_dst, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->udp_dst, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_UDPSRC: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->udp_src, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->udp_src, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_ICMP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_ICMP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_ICMP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_ICMP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_ICMP; + break; + + default: + return -EINVAL; + } + + break; + + default: + return -EINVAL; + } + + break; + + default: + return -EINVAL; + } + + return 0; +} + + +/* + * Check if packet should have its dest mac address set to the box itself + * for diversion + */ + +#define ETH_DIVERT_FRAME(skb) \ + memcpy(eth_hdr(skb), skb->dev->dev_addr, ETH_ALEN); \ + skb->pkt_type=PACKET_HOST + +void divert_frame(struct sk_buff *skb) +{ + struct ethhdr *eth = eth_hdr(skb); + struct iphdr *iph; + struct tcphdr *tcph; + struct udphdr *udph; + struct divert_blk *divert = skb->dev->divert; + int i, src, dst; + unsigned char *skb_data_end = skb->data + skb->len; + + /* Packet is already aimed at us, return */ + if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN)) + return; + + /* proto is not IP, do nothing */ + if (eth->h_proto != htons(ETH_P_IP)) + return; + + /* Divert all IP frames ? */ + if (divert->protos & DIVERT_PROTO_IP) { + ETH_DIVERT_FRAME(skb); + return; + } + + /* Check for possible (maliciously) malformed IP frame (thanks Dave) */ + iph = (struct iphdr *) skb->data; + if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) { + printk(KERN_INFO "divert: malformed IP packet !\n"); + return; + } + + switch (iph->protocol) { + /* Divert all ICMP frames ? */ + case IPPROTO_ICMP: + if (divert->protos & DIVERT_PROTO_ICMP) { + ETH_DIVERT_FRAME(skb); + return; + } + break; + + /* Divert all TCP frames ? */ + case IPPROTO_TCP: + if (divert->protos & DIVERT_PROTO_TCP) { + ETH_DIVERT_FRAME(skb); + return; + } + + /* Check for possible (maliciously) malformed IP + * frame (thanx Dave) + */ + tcph = (struct tcphdr *) + (((unsigned char *)iph) + (iph->ihl<<2)); + if (((unsigned char *)(tcph+1)) >= skb_data_end) { + printk(KERN_INFO "divert: malformed TCP packet !\n"); + return; + } + + /* Divert some tcp dst/src ports only ?*/ + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + dst = divert->tcp_dst[i]; + src = divert->tcp_src[i]; + if ((dst && dst == tcph->dest) || + (src && src == tcph->source)) { + ETH_DIVERT_FRAME(skb); + return; + } + } + break; + + /* Divert all UDP frames ? */ + case IPPROTO_UDP: + if (divert->protos & DIVERT_PROTO_UDP) { + ETH_DIVERT_FRAME(skb); + return; + } + + /* Check for possible (maliciously) malformed IP + * packet (thanks Dave) + */ + udph = (struct udphdr *) + (((unsigned char *)iph) + (iph->ihl<<2)); + if (((unsigned char *)(udph+1)) >= skb_data_end) { + printk(KERN_INFO + "divert: malformed UDP packet !\n"); + return; + } + + /* Divert some udp dst/src ports only ? */ + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + dst = divert->udp_dst[i]; + src = divert->udp_src[i]; + if ((dst && dst == udph->dest) || + (src && src == udph->source)) { + ETH_DIVERT_FRAME(skb); + return; + } + } + break; + } +} diff --git a/net/core/ethtool.c b/net/core/ethtool.c new file mode 100644 index 000000000000..f05fde97c43d --- /dev/null +++ b/net/core/ethtool.c @@ -0,0 +1,819 @@ +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs. We fall back to calling do_ioctl() + * for drivers which haven't been converted to ethtool_ops yet. + * + * It's GPL, stupid. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> +#include <asm/uaccess.h> + +/* + * Some useful ethtool_ops methods that're device independent. + * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ + return netif_carrier_ok(dev) ? 1 : 0; +} + +u32 ethtool_op_get_tx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_IP_CSUM) != 0; +} + +int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_IP_CSUM; + else + dev->features &= ~NETIF_F_IP_CSUM; + + return 0; +} + +u32 ethtool_op_get_sg(struct net_device *dev) +{ + return (dev->features & NETIF_F_SG) != 0; +} + +int ethtool_op_set_sg(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_SG; + else + dev->features &= ~NETIF_F_SG; + + return 0; +} + +u32 ethtool_op_get_tso(struct net_device *dev) +{ + return (dev->features & NETIF_F_TSO) != 0; +} + +int ethtool_op_set_tso(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_TSO; + else + dev->features &= ~NETIF_F_TSO; + + return 0; +} + +/* Handlers for each ethtool command */ + +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + return 0; +} + +static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + return dev->ethtool_ops->set_settings(dev, &cmd); +} + +static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_drvinfo info; + struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_drvinfo) + return -EOPNOTSUPP; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; + ops->get_drvinfo(dev, &info); + + if (ops->self_test_count) + info.testinfo_len = ops->self_test_count(dev); + if (ops->get_stats_count) + info.n_stats = ops->get_stats_count(dev); + if (ops->get_regs_len) + info.regdump_len = ops->get_regs_len(dev); + if (ops->get_eeprom_len) + info.eedump_len = ops->get_eeprom_len(dev); + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_regs regs; + struct ethtool_ops *ops = dev->ethtool_ops; + void *regbuf; + int reglen, ret; + + if (!ops->get_regs || !ops->get_regs_len) + return -EOPNOTSUPP; + + if (copy_from_user(®s, useraddr, sizeof(regs))) + return -EFAULT; + + reglen = ops->get_regs_len(dev); + if (regs.len > reglen) + regs.len = reglen; + + regbuf = kmalloc(reglen, GFP_USER); + if (!regbuf) + return -ENOMEM; + + ops->get_regs(dev, ®s, regbuf); + + ret = -EFAULT; + if (copy_to_user(useraddr, ®s, sizeof(regs))) + goto out; + useraddr += offsetof(struct ethtool_regs, data); + if (copy_to_user(useraddr, regbuf, regs.len)) + goto out; + ret = 0; + + out: + kfree(regbuf); + return ret; +} + +static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_wol(dev, &wol); + + if (copy_to_user(useraddr, &wol, sizeof(wol))) + return -EFAULT; + return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol; + + if (!dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + if (copy_from_user(&wol, useraddr, sizeof(wol))) + return -EFAULT; + + return dev->ethtool_ops->set_wol(dev, &wol); +} + +static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GMSGLVL }; + + if (!dev->ethtool_ops->get_msglevel) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_msglevel(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_msglevel) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + dev->ethtool_ops->set_msglevel(dev, edata.data); + return 0; +} + +static int ethtool_nway_reset(struct net_device *dev) +{ + if (!dev->ethtool_ops->nway_reset) + return -EOPNOTSUPP; + + return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->get_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(eeprom.len, GFP_USER); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) + goto out; + + ret = ops->get_eeprom(dev, &eeprom, data); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) + goto out; + if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->set_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(eeprom.len, GFP_USER); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) + goto out; + + ret = ops->set_eeprom(dev, &eeprom, data); + if (ret) + goto out; + + if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) + ret = -EFAULT; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_coalesce(dev, &coalesce); + + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + return 0; +} + +static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_coalesce coalesce; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) + return -EFAULT; + + return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_ringparam(dev, &ringparam); + + if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam; + + if (!dev->ethtool_ops->set_ringparam) + return -EOPNOTSUPP; + + if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) + return -EFAULT; + + return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + + if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) + return -EFAULT; + + return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GRXCSUM }; + + if (!dev->ethtool_ops->get_rx_csum) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_rx_csum(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_rx_csum) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + dev->ethtool_ops->set_rx_csum(dev, edata.data); + return 0; +} + +static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GTXCSUM }; + + if (!dev->ethtool_ops->get_tx_csum) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_tx_csum(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int __ethtool_set_sg(struct net_device *dev, u32 data) +{ + int err; + + if (!data && dev->ethtool_ops->set_tso) { + err = dev->ethtool_ops->set_tso(dev, 0); + if (err) + return err; + } + + return dev->ethtool_ops->set_sg(dev, data); +} + +static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + int err; + + if (!dev->ethtool_ops->set_tx_csum) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + if (!edata.data && dev->ethtool_ops->set_sg) { + err = __ethtool_set_sg(dev, 0); + if (err) + return err; + } + + return dev->ethtool_ops->set_tx_csum(dev, edata.data); +} + +static int ethtool_get_sg(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GSG }; + + if (!dev->ethtool_ops->get_sg) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_sg(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + if (edata.data && + !(dev->features & (NETIF_F_IP_CSUM | + NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM))) + return -EINVAL; + + return __ethtool_set_sg(dev, edata.data); +} + +static int ethtool_get_tso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GTSO }; + + if (!dev->ethtool_ops->get_tso) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_tso(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_tso) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + if (edata.data && !(dev->features & NETIF_F_SG)) + return -EINVAL; + + return dev->ethtool_ops->set_tso(dev, edata.data); +} + +static int ethtool_self_test(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_test test; + struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret; + + if (!ops->self_test || !ops->self_test_count) + return -EOPNOTSUPP; + + if (copy_from_user(&test, useraddr, sizeof(test))) + return -EFAULT; + + test.len = ops->self_test_count(dev); + data = kmalloc(test.len * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->self_test(dev, &test, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &test, sizeof(test))) + goto out; + useraddr += sizeof(test); + if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->get_strings) + return -EOPNOTSUPP; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + switch (gstrings.string_set) { + case ETH_SS_TEST: + if (!ops->self_test_count) + return -EOPNOTSUPP; + gstrings.len = ops->self_test_count(dev); + break; + case ETH_SS_STATS: + if (!ops->get_stats_count) + return -EOPNOTSUPP; + gstrings.len = ops->get_stats_count(dev); + break; + default: + return -EINVAL; + } + + data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_strings(dev, gstrings.string_set, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->phys_id) + return -EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->phys_id(dev, id.data); +} + +static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret; + + if (!ops->get_ethtool_stats || !ops->get_stats_count) + return -EOPNOTSUPP; + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = ops->get_stats_count(dev); + data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_ethtool_stats(dev, &stats, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +/* The main entry point in this file. Called from net/core/dev.c */ + +int dev_ethtool(struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + void __user *useraddr = ifr->ifr_data; + u32 ethcmd; + int rc; + + /* + * XXX: This can be pushed down into the ethtool_* handlers that + * need it. Keep existing behaviour for the moment. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + if (!dev->ethtool_ops) + goto ioctl; + + if (copy_from_user(ðcmd, useraddr, sizeof (ethcmd))) + return -EFAULT; + + if(dev->ethtool_ops->begin) + if ((rc = dev->ethtool_ops->begin(dev)) < 0) + return rc; + + switch (ethcmd) { + case ETHTOOL_GSET: + rc = ethtool_get_settings(dev, useraddr); + break; + case ETHTOOL_SSET: + rc = ethtool_set_settings(dev, useraddr); + break; + case ETHTOOL_GDRVINFO: + rc = ethtool_get_drvinfo(dev, useraddr); + + break; + case ETHTOOL_GREGS: + rc = ethtool_get_regs(dev, useraddr); + break; + case ETHTOOL_GWOL: + rc = ethtool_get_wol(dev, useraddr); + break; + case ETHTOOL_SWOL: + rc = ethtool_set_wol(dev, useraddr); + break; + case ETHTOOL_GMSGLVL: + rc = ethtool_get_msglevel(dev, useraddr); + break; + case ETHTOOL_SMSGLVL: + rc = ethtool_set_msglevel(dev, useraddr); + break; + case ETHTOOL_NWAY_RST: + rc = ethtool_nway_reset(dev); + break; + case ETHTOOL_GLINK: + rc = ethtool_get_link(dev, useraddr); + break; + case ETHTOOL_GEEPROM: + rc = ethtool_get_eeprom(dev, useraddr); + break; + case ETHTOOL_SEEPROM: + rc = ethtool_set_eeprom(dev, useraddr); + break; + case ETHTOOL_GCOALESCE: + rc = ethtool_get_coalesce(dev, useraddr); + break; + case ETHTOOL_SCOALESCE: + rc = ethtool_set_coalesce(dev, useraddr); + break; + case ETHTOOL_GRINGPARAM: + rc = ethtool_get_ringparam(dev, useraddr); + break; + case ETHTOOL_SRINGPARAM: + rc = ethtool_set_ringparam(dev, useraddr); + break; + case ETHTOOL_GPAUSEPARAM: + rc = ethtool_get_pauseparam(dev, useraddr); + break; + case ETHTOOL_SPAUSEPARAM: + rc = ethtool_set_pauseparam(dev, useraddr); + break; + case ETHTOOL_GRXCSUM: + rc = ethtool_get_rx_csum(dev, useraddr); + break; + case ETHTOOL_SRXCSUM: + rc = ethtool_set_rx_csum(dev, useraddr); + break; + case ETHTOOL_GTXCSUM: + rc = ethtool_get_tx_csum(dev, useraddr); + break; + case ETHTOOL_STXCSUM: + rc = ethtool_set_tx_csum(dev, useraddr); + break; + case ETHTOOL_GSG: + rc = ethtool_get_sg(dev, useraddr); + break; + case ETHTOOL_SSG: + rc = ethtool_set_sg(dev, useraddr); + break; + case ETHTOOL_GTSO: + rc = ethtool_get_tso(dev, useraddr); + break; + case ETHTOOL_STSO: + rc = ethtool_set_tso(dev, useraddr); + break; + case ETHTOOL_TEST: + rc = ethtool_self_test(dev, useraddr); + break; + case ETHTOOL_GSTRINGS: + rc = ethtool_get_strings(dev, useraddr); + break; + case ETHTOOL_PHYS_ID: + rc = ethtool_phys_id(dev, useraddr); + break; + case ETHTOOL_GSTATS: + rc = ethtool_get_stats(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if(dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + return rc; + + ioctl: + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, SIOCETHTOOL); + return -EOPNOTSUPP; +} + +EXPORT_SYMBOL(dev_ethtool); +EXPORT_SYMBOL(ethtool_op_get_link); +EXPORT_SYMBOL(ethtool_op_get_sg); +EXPORT_SYMBOL(ethtool_op_get_tso); +EXPORT_SYMBOL(ethtool_op_get_tx_csum); +EXPORT_SYMBOL(ethtool_op_set_sg); +EXPORT_SYMBOL(ethtool_op_set_tso); +EXPORT_SYMBOL(ethtool_op_set_tx_csum); diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 000000000000..f3b88205ace2 --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,432 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist <jschlst@samba.org> + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/filter.h> + +/* No hurry in this branch */ +static u8 *load_pointer(struct sk_buff *skb, int k) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb->nh.raw + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb->mac.raw + k - SKF_LL_OFF; + + if (ptr >= skb->head && ptr < skb->tail) + return ptr; + return NULL; +} + +/** + * sk_run_filter - run a filter on a socket + * @skb: buffer to run the filter on + * @filter: filter to apply + * @flen: length of filter + * + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ + +int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +{ + unsigned char *data = skb->data; + /* len is UNSIGNED. Byte wide insns relies only on implicit + type casts to prevent reading arbitrary memory locations. + */ + unsigned int len = skb->len-skb->data_len; + struct sock_filter *fentry; /* We walk down these */ + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + int k; + int pc; + + /* + * Process array of filter instructions. + */ + for (pc = 0; pc < flen; pc++) { + fentry = &filter[pc]; + + switch (fentry->code) { + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + case BPF_ALU|BPF_ADD|BPF_K: + A += fentry->k; + continue; + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + case BPF_ALU|BPF_SUB|BPF_K: + A -= fentry->k; + continue; + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + case BPF_ALU|BPF_MUL|BPF_K: + A *= fentry->k; + continue; + case BPF_ALU|BPF_DIV|BPF_X: + if (X == 0) + return 0; + A /= X; + continue; + case BPF_ALU|BPF_DIV|BPF_K: + if (fentry->k == 0) + return 0; + A /= fentry->k; + continue; + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + case BPF_ALU|BPF_AND|BPF_K: + A &= fentry->k; + continue; + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + case BPF_ALU|BPF_OR|BPF_K: + A |= fentry->k; + continue; + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + case BPF_ALU|BPF_LSH|BPF_K: + A <<= fentry->k; + continue; + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + case BPF_ALU|BPF_RSH|BPF_K: + A >>= fentry->k; + continue; + case BPF_ALU|BPF_NEG: + A = -A; + continue; + case BPF_JMP|BPF_JA: + pc += fentry->k; + continue; + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? fentry->jt : fentry->jf; + continue; + case BPF_LD|BPF_W|BPF_ABS: + k = fentry->k; + load_w: + if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { + A = ntohl(*(u32*)&data[k]); + continue; + } + if (k < 0) { + u8 *ptr; + + if (k >= SKF_AD_OFF) + break; + ptr = load_pointer(skb, k); + if (ptr) { + A = ntohl(*(u32*)ptr); + continue; + } + } else { + u32 _tmp, *p; + p = skb_header_pointer(skb, k, 4, &_tmp); + if (p != NULL) { + A = ntohl(*p); + continue; + } + } + return 0; + case BPF_LD|BPF_H|BPF_ABS: + k = fentry->k; + load_h: + if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { + A = ntohs(*(u16*)&data[k]); + continue; + } + if (k < 0) { + u8 *ptr; + + if (k >= SKF_AD_OFF) + break; + ptr = load_pointer(skb, k); + if (ptr) { + A = ntohs(*(u16*)ptr); + continue; + } + } else { + u16 _tmp, *p; + p = skb_header_pointer(skb, k, 2, &_tmp); + if (p != NULL) { + A = ntohs(*p); + continue; + } + } + return 0; + case BPF_LD|BPF_B|BPF_ABS: + k = fentry->k; +load_b: + if (k >= 0 && (unsigned int)k < len) { + A = data[k]; + continue; + } + if (k < 0) { + u8 *ptr; + + if (k >= SKF_AD_OFF) + break; + ptr = load_pointer(skb, k); + if (ptr) { + A = *ptr; + continue; + } + } else { + u8 _tmp, *p; + p = skb_header_pointer(skb, k, 1, &_tmp); + if (p != NULL) { + A = *p; + continue; + } + } + return 0; + case BPF_LD|BPF_W|BPF_LEN: + A = len; + continue; + case BPF_LDX|BPF_W|BPF_LEN: + X = len; + continue; + case BPF_LD|BPF_W|BPF_IND: + k = X + fentry->k; + goto load_w; + case BPF_LD|BPF_H|BPF_IND: + k = X + fentry->k; + goto load_h; + case BPF_LD|BPF_B|BPF_IND: + k = X + fentry->k; + goto load_b; + case BPF_LDX|BPF_B|BPF_MSH: + if (fentry->k >= len) + return 0; + X = (data[fentry->k] & 0xf) << 2; + continue; + case BPF_LD|BPF_IMM: + A = fentry->k; + continue; + case BPF_LDX|BPF_IMM: + X = fentry->k; + continue; + case BPF_LD|BPF_MEM: + A = mem[fentry->k]; + continue; + case BPF_LDX|BPF_MEM: + X = mem[fentry->k]; + continue; + case BPF_MISC|BPF_TAX: + X = A; + continue; + case BPF_MISC|BPF_TXA: + A = X; + continue; + case BPF_RET|BPF_K: + return ((unsigned int)fentry->k); + case BPF_RET|BPF_A: + return ((unsigned int)A); + case BPF_ST: + mem[fentry->k] = A; + continue; + case BPF_STX: + mem[fentry->k] = X; + continue; + default: + /* Invalid instruction counts as RET */ + return 0; + } + + /* + * Handle ancillary data, which are impossible + * (or very difficult) to get parsing packet contents. + */ + switch (k-SKF_AD_OFF) { + case SKF_AD_PROTOCOL: + A = htons(skb->protocol); + continue; + case SKF_AD_PKTTYPE: + A = skb->pkt_type; + continue; + case SKF_AD_IFINDEX: + A = skb->dev->ifindex; + continue; + default: + return 0; + } + } + + return 0; +} + +/** + * sk_chk_filter - verify socket filter code + * @filter: filter to verify + * @flen: length of filter + * + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal instructions + * and no backward jumps. It must end with a RET instruction + * + * Returns 0 if the rule set is legal or a negative errno code if not. + */ +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + struct sock_filter *ftest; + int pc; + + if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0) + return -EINVAL; + + /* check the filter code now */ + for (pc = 0; pc < flen; pc++) { + /* all jumps are forward as they are not signed */ + ftest = &filter[pc]; + if (BPF_CLASS(ftest->code) == BPF_JMP) { + /* but they mustn't jump off the end */ + if (BPF_OP(ftest->code) == BPF_JA) { + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + } else { + /* for conditionals both must be safe */ + if (pc + ftest->jt +1 >= flen || + pc + ftest->jf +1 >= flen) + return -EINVAL; + } + } + + /* check that memory operations use valid addresses. */ + if (ftest->k >= BPF_MEMWORDS) { + /* but it might not be a memory operation... */ + switch (ftest->code) { + case BPF_ST: + case BPF_STX: + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + return -EINVAL; + } + } + } + + /* + * The program must end with a return. We don't care where they + * jumped within the script (its always forwards) but in the end + * they _will_ hit this. + */ + return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL; +} + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sk_filter *fp; + unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + return -EINVAL; + + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + return -EFAULT; + } + + atomic_set(&fp->refcnt, 1); + fp->len = fprog->len; + + err = sk_chk_filter(fp->insns, fp->len); + if (!err) { + struct sk_filter *old_fp; + + spin_lock_bh(&sk->sk_lock.slock); + old_fp = sk->sk_filter; + sk->sk_filter = fp; + spin_unlock_bh(&sk->sk_lock.slock); + fp = old_fp; + } + + if (fp) + sk_filter_release(sk, fp); + return err; +} + +EXPORT_SYMBOL(sk_chk_filter); +EXPORT_SYMBOL(sk_run_filter); diff --git a/net/core/flow.c b/net/core/flow.c new file mode 100644 index 000000000000..f289570b15a3 --- /dev/null +++ b/net/core/flow.c @@ -0,0 +1,371 @@ +/* flow.c: Generic flow cache. + * + * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/interrupt.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/completion.h> +#include <linux/percpu.h> +#include <linux/bitops.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <net/flow.h> +#include <asm/atomic.h> +#include <asm/semaphore.h> + +struct flow_cache_entry { + struct flow_cache_entry *next; + u16 family; + u8 dir; + struct flowi key; + u32 genid; + void *object; + atomic_t *object_ref; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); + +static u32 flow_hash_shift; +#define flow_hash_size (1 << flow_hash_shift) +static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; + +#define flow_table(cpu) (per_cpu(flow_tables, cpu)) + +static kmem_cache_t *flow_cachep; + +static int flow_lwm, flow_hwm; + +struct flow_percpu_info { + int hash_rnd_recalc; + u32 hash_rnd; + int count; +} ____cacheline_aligned; +static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; + +#define flow_hash_rnd_recalc(cpu) \ + (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) +#define flow_hash_rnd(cpu) \ + (per_cpu(flow_hash_info, cpu).hash_rnd) +#define flow_count(cpu) \ + (per_cpu(flow_hash_info, cpu).count) + +static struct timer_list flow_hash_rnd_timer; + +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + +struct flow_flush_info { + atomic_t cpuleft; + struct completion completion; +}; +static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; + +#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) + +static void flow_cache_new_hashrnd(unsigned long arg) +{ + int i; + + for_each_cpu(i) + flow_hash_rnd_recalc(i) = 1; + + flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&flow_hash_rnd_timer); +} + +static void __flow_cache_shrink(int cpu, int shrink_to) +{ + struct flow_cache_entry *fle, **flp; + int i; + + for (i = 0; i < flow_hash_size; i++) { + int k = 0; + + flp = &flow_table(cpu)[i]; + while ((fle = *flp) != NULL && k < shrink_to) { + k++; + flp = &fle->next; + } + while ((fle = *flp) != NULL) { + *flp = fle->next; + if (fle->object) + atomic_dec(fle->object_ref); + kmem_cache_free(flow_cachep, fle); + flow_count(cpu)--; + } + } +} + +static void flow_cache_shrink(int cpu) +{ + int shrink_to = flow_lwm / flow_hash_size; + + __flow_cache_shrink(cpu, shrink_to); +} + +static void flow_new_hash_rnd(int cpu) +{ + get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); + flow_hash_rnd_recalc(cpu) = 0; + + __flow_cache_shrink(cpu, 0); +} + +static u32 flow_hash_code(struct flowi *key, int cpu) +{ + u32 *k = (u32 *) key; + + return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & + (flow_hash_size - 1)); +} + +#if (BITS_PER_LONG == 64) +typedef u64 flow_compare_t; +#else +typedef u32 flow_compare_t; +#endif + +extern void flowi_is_missized(void); + +/* I hear what you're saying, use memcmp. But memcmp cannot make + * important assumptions that we can here, such as alignment and + * constant size. + */ +static int flow_key_compare(struct flowi *key1, struct flowi *key2) +{ + flow_compare_t *k1, *k1_lim, *k2; + const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); + + if (sizeof(struct flowi) % sizeof(flow_compare_t)) + flowi_is_missized(); + + k1 = (flow_compare_t *) key1; + k1_lim = k1 + n_elem; + + k2 = (flow_compare_t *) key2; + + do { + if (*k1++ != *k2++) + return 1; + } while (k1 < k1_lim); + + return 0; +} + +void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver) +{ + struct flow_cache_entry *fle, **head; + unsigned int hash; + int cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + + fle = NULL; + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. --RR */ + if (!flow_table(cpu)) + goto nocache; + + if (flow_hash_rnd_recalc(cpu)) + flow_new_hash_rnd(cpu); + hash = flow_hash_code(key, cpu); + + head = &flow_table(cpu)[hash]; + for (fle = *head; fle; fle = fle->next) { + if (fle->family == family && + fle->dir == dir && + flow_key_compare(key, &fle->key) == 0) { + if (fle->genid == atomic_read(&flow_cache_genid)) { + void *ret = fle->object; + + if (ret) + atomic_inc(fle->object_ref); + local_bh_enable(); + + return ret; + } + break; + } + } + + if (!fle) { + if (flow_count(cpu) > flow_hwm) + flow_cache_shrink(cpu); + + fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC); + if (fle) { + fle->next = *head; + *head = fle; + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fle->object = NULL; + flow_count(cpu)++; + } + } + +nocache: + { + void *obj; + atomic_t *obj_ref; + + resolver(key, family, dir, &obj, &obj_ref); + + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + + if (fle->object) + atomic_dec(fle->object_ref); + + fle->object = obj; + fle->object_ref = obj_ref; + if (obj) + atomic_inc(fle->object_ref); + } + local_bh_enable(); + + return obj; + } +} + +static void flow_cache_flush_tasklet(unsigned long data) +{ + struct flow_flush_info *info = (void *)data; + int i; + int cpu; + + cpu = smp_processor_id(); + for (i = 0; i < flow_hash_size; i++) { + struct flow_cache_entry *fle; + + fle = flow_table(cpu)[i]; + for (; fle; fle = fle->next) { + unsigned genid = atomic_read(&flow_cache_genid); + + if (!fle->object || fle->genid == genid) + continue; + + fle->object = NULL; + atomic_dec(fle->object_ref); + } + } + + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); +} + +static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); +static void flow_cache_flush_per_cpu(void *data) +{ + struct flow_flush_info *info = data; + int cpu; + struct tasklet_struct *tasklet; + + cpu = smp_processor_id(); + + tasklet = flow_flush_tasklet(cpu); + tasklet->data = (unsigned long)info; + tasklet_schedule(tasklet); +} + +void flow_cache_flush(void) +{ + struct flow_flush_info info; + static DECLARE_MUTEX(flow_flush_sem); + + /* Don't want cpus going down or up during this. */ + lock_cpu_hotplug(); + down(&flow_flush_sem); + atomic_set(&info.cpuleft, num_online_cpus()); + init_completion(&info.completion); + + local_bh_disable(); + smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); + flow_cache_flush_tasklet((unsigned long)&info); + local_bh_enable(); + + wait_for_completion(&info.completion); + up(&flow_flush_sem); + unlock_cpu_hotplug(); +} + +static void __devinit flow_cache_cpu_prepare(int cpu) +{ + struct tasklet_struct *tasklet; + unsigned long order; + + for (order = 0; + (PAGE_SIZE << order) < + (sizeof(struct flow_cache_entry *)*flow_hash_size); + order++) + /* NOTHING */; + + flow_table(cpu) = (struct flow_cache_entry **) + __get_free_pages(GFP_KERNEL, order); + if (!flow_table(cpu)) + panic("NET: failed to allocate flow cache order %lu\n", order); + + memset(flow_table(cpu), 0, PAGE_SIZE << order); + + flow_hash_rnd_recalc(cpu) = 1; + flow_count(cpu) = 0; + + tasklet = flow_flush_tasklet(cpu); + tasklet_init(tasklet, flow_cache_flush_tasklet, 0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + if (action == CPU_DEAD) + __flow_cache_shrink((unsigned long)hcpu, 0); + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init flow_cache_init(void) +{ + int i; + + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!flow_cachep) + panic("NET: failed to allocate flow cache slab\n"); + + flow_hash_shift = 10; + flow_lwm = 2 * flow_hash_size; + flow_hwm = 4 * flow_hash_size; + + init_timer(&flow_hash_rnd_timer); + flow_hash_rnd_timer.function = flow_cache_new_hashrnd; + flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&flow_hash_rnd_timer); + + for_each_cpu(i) + flow_cache_cpu_prepare(i); + + hotcpu_notifier(flow_cache_cpu, 0); + return 0; +} + +module_init(flow_cache_init); + +EXPORT_SYMBOL(flow_cache_genid); +EXPORT_SYMBOL(flow_cache_lookup); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c new file mode 100644 index 000000000000..b07c029e8219 --- /dev/null +++ b/net/core/gen_estimator.c @@ -0,0 +1,250 @@ +/* + * net/sched/gen_estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Jamal Hadi Salim - moved it to net/core and reshulfed + * names to make it usable in general net subsystem. + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/gen_stats.h> + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to built an estimator into the scheduler, + but run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<<interval) seconds and evaluate EWMA: + + avrate = avrate*(1-W) + rate*W + + where W is chosen as negative power of 2: W = 2^(-ewma_log) + + The resulting time constant is: + + T = A/(-ln(1-W)) + + + NOTES. + + * The stored value for avbps is scaled by 2^5, so that maximal + rate is ~1Gbit, avpps is scaled by 2^10. + + * Minimal interval is HZ/4=250msec (it is the greatest common divisor + for HZ=100 and HZ=1024 8)), maximal interval + is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals + are too expensive, longer ones can be implemented + at user level painlessly. + */ + +#define EST_MAX_INTERVAL 5 + +struct gen_estimator +{ + struct gen_estimator *next; + struct gnet_stats_basic *bstats; + struct gnet_stats_rate_est *rate_est; + spinlock_t *stats_lock; + unsigned interval; + int ewma_log; + u64 last_bytes; + u32 last_packets; + u32 avpps; + u32 avbps; +}; + +struct gen_estimator_head +{ + struct timer_list timer; + struct gen_estimator *list; +}; + +static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; + +/* Estimator array lock */ +static DEFINE_RWLOCK(est_lock); + +static void est_timer(unsigned long arg) +{ + int idx = (int)arg; + struct gen_estimator *e; + + read_lock(&est_lock); + for (e = elist[idx].list; e; e = e->next) { + u64 nbytes; + u32 npackets; + u32 rate; + + spin_lock(e->stats_lock); + nbytes = e->bstats->bytes; + npackets = e->bstats->packets; + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + e->rate_est->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->rate_est->pps = (e->avpps+0x1FF)>>10; + spin_unlock(e->stats_lock); + } + + mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); + read_unlock(&est_lock); +} + +/** + * gen_new_estimator - create a new rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est + * as destination. A new timer with the interval specified in the + * configuration TLV is created. Upon each interval, the latest statistics + * will be read from &bstats and the estimated rate will be stored in + * &rate_est with the statistics lock grabed during this period. + * + * Returns 0 on success or a negative error code. + */ +int gen_new_estimator(struct gnet_stats_basic *bstats, + struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt) +{ + struct gen_estimator *est; + struct gnet_estimator *parm = RTA_DATA(opt); + + if (RTA_PAYLOAD(opt) < sizeof(*parm)) + return -EINVAL; + + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->bstats = bstats; + est->rate_est = rate_est; + est->stats_lock = stats_lock; + est->ewma_log = parm->ewma_log; + est->last_bytes = bstats->bytes; + est->avbps = rate_est->bps<<5; + est->last_packets = bstats->packets; + est->avpps = rate_est->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + write_lock_bh(&est_lock); + elist[est->interval].list = est; + write_unlock_bh(&est_lock); + return 0; +} + +/** + * gen_kill_estimator - remove a rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Removes the rate estimator specified by &bstats and &rate_est + * and deletes the timer. + */ +void gen_kill_estimator(struct gnet_stats_basic *bstats, + struct gnet_stats_rate_est *rate_est) +{ + int idx; + struct gen_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->rate_est != rate_est || est->bstats != bstats) { + pest = &est->next; + continue; + } + + write_lock_bh(&est_lock); + *pest = est->next; + write_unlock_bh(&est_lock); + + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + +/** + * gen_replace_estimator - replace rate estimator configruation + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling + * gen_kill_estimator() and gen_new_estimator(). + * + * Returns 0 on success or a negative error code. + */ +int +gen_replace_estimator(struct gnet_stats_basic *bstats, + struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, + struct rtattr *opt) +{ + gen_kill_estimator(bstats, rate_est); + return gen_new_estimator(bstats, rate_est, stats_lock, opt); +} + + +EXPORT_SYMBOL(gen_kill_estimator); +EXPORT_SYMBOL(gen_new_estimator); +EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c new file mode 100644 index 000000000000..8f21490355fa --- /dev/null +++ b/net/core/gen_stats.c @@ -0,0 +1,239 @@ +/* + * net/core/gen_stats.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * Jamal Hadi Salim + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * See Documentation/networking/gen_stats.txt + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/socket.h> +#include <linux/rtnetlink.h> +#include <linux/gen_stats.h> +#include <net/gen_stats.h> + + +static inline int +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +{ + RTA_PUT(d->skb, type, size, buf); + return 0; + +rtattr_failure: + spin_unlock_bh(d->lock); + return -1; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV + * @xstats_type: TLV type for backward compatibility xstats TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * The dumping handle is marked to be in backward compatibility mode telling + * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, + int xstats_type, spinlock_t *lock, struct gnet_dump *d) +{ + memset(d, 0, sizeof(*d)); + + spin_lock_bh(lock); + d->lock = lock; + if (type) + d->tail = (struct rtattr *) skb->tail; + d->skb = skb; + d->compat_tc_stats = tc_stats_type; + d->compat_xstats = xstats_type; + + if (d->tail) + return gnet_stats_copy(d, type, NULL, 0); + + return 0; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d) +{ + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); +} + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @d: dumping handle + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b) +{ + if (d->compat_tc_stats) { + d->tc_stats.bytes = b->bytes; + d->tc_stats.packets = b->packets; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b)); + + return 0; +} + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @r: rate estimator statistics + * + * Appends the rate estimator statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) +{ + if (d->compat_tc_stats) { + d->tc_stats.bps = r->bps; + d->tc_stats.pps = r->pps; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + + return 0; +} + +/** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV + * @d: dumping handle + * @q: queue statistics + * + * Appends the queue statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +{ + if (d->compat_tc_stats) { + d->tc_stats.drops = q->drops; + d->tc_stats.qlen = q->qlen; + d->tc_stats.backlog = q->backlog; + d->tc_stats.overlimits = q->overlimits; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + + return 0; +} + +/** + * gnet_stats_copy_app - copy application specific statistics into statistics TLV + * @d: dumping handle + * @st: application specific statistics data + * @len: length of data + * + * Appends the application sepecific statistics to the top level TLV created by + * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping + * handle is in backward compatibility mode. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) +{ + if (d->compat_xstats) { + d->xstats = st; + d->xstats_len = len; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_APP, st, len); + + return 0; +} + +/** + * gnet_stats_finish_copy - finish dumping procedure + * @d: dumping handle + * + * Corrects the length of the top level TLV to include all TLVs added + * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs + * if gnet_stats_start_copy_compat() was used and releases the statistics + * lock. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_finish_copy(struct gnet_dump *d) +{ + if (d->tail) + d->tail->rta_len = d->skb->tail - (u8 *) d->tail; + + if (d->compat_tc_stats) + if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, + sizeof(d->tc_stats)) < 0) + return -1; + + if (d->compat_xstats && d->xstats) { + if (gnet_stats_copy(d, d->compat_xstats, d->xstats, + d->xstats_len) < 0) + return -1; + } + + spin_unlock_bh(d->lock); + return 0; +} + + +EXPORT_SYMBOL(gnet_stats_start_copy); +EXPORT_SYMBOL(gnet_stats_start_copy_compat); +EXPORT_SYMBOL(gnet_stats_copy_basic); +EXPORT_SYMBOL(gnet_stats_copy_rate_est); +EXPORT_SYMBOL(gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_copy_app); +EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/iovec.c b/net/core/iovec.c new file mode 100644 index 000000000000..d57ace949ab8 --- /dev/null +++ b/net/core/iovec.c @@ -0,0 +1,239 @@ +/* + * iovec manipulation routines. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Andrew Lunn : Errors in iovec copying. + * Pedro Roque : Added memcpy_fromiovecend and + * csum_..._fromiovecend. + * Andi Kleen : fixed error handling for 2.1 + * Alexey Kuznetsov: 2.1 optimisations + * Andi Kleen : Fix csum*fromiovecend for IPv6. + */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> +#include <net/checksum.h> +#include <net/sock.h> + +/* + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. + * + * Save time not doing verify_area. copy_*_user will make this work + * in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +{ + int size, err, ct; + + if (m->msg_namelen) { + if (mode == VERIFY_READ) { + err = move_addr_to_kernel(m->msg_name, m->msg_namelen, + address); + if (err < 0) + return err; + } + m->msg_name = address; + } else { + m->msg_name = NULL; + } + + size = m->msg_iovlen * sizeof(struct iovec); + if (copy_from_user(iov, m->msg_iov, size)) + return -EFAULT; + + m->msg_iov = iov; + err = 0; + + for (ct = 0; ct < m->msg_iovlen; ct++) { + err += iov[ct].iov_len; + /* + * Goal is not to verify user data, but to prevent returning + * negative value, which is interpreted as errno. + * Overflow is still possible, but it is harmless. + */ + if (err < 0) + return -EMSGSIZE; + } + + return err; +} + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return 0; +} + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + iov++; + } + + return 0; +} + +/* + * For use with ip_build_xmit + */ +int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, + int len) +{ + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov++; + } + + return 0; +} + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + */ +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, int *csump) +{ + int csum = *csump; + int partial_cnt = 0, err = 0; + + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + + /* There is a remnant from previous iov. */ + if (partial_cnt) { + int par_len = 4 - partial_cnt; + + /* iov component is too short ... */ + if (par_len > copy) { + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; + base += copy; + partial_cnt += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; + } + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; + base += par_len; + copy -= par_len; + len -= par_len; + partial_cnt = 0; + } + + if (len > copy) { + partial_cnt = copy % 4; + if (partial_cnt) { + copy -= partial_cnt; + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; + } + } + + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; + } + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + *csump = csum; +out: + return err; + +out_fault: + err = -EFAULT; + goto out; +} + +EXPORT_SYMBOL(csum_partial_copy_fromiovecend); +EXPORT_SYMBOL(memcpy_fromiovec); +EXPORT_SYMBOL(memcpy_fromiovecend); +EXPORT_SYMBOL(memcpy_toiovec); diff --git a/net/core/link_watch.c b/net/core/link_watch.c new file mode 100644 index 000000000000..4859b7446c6f --- /dev/null +++ b/net/core/link_watch.c @@ -0,0 +1,137 @@ +/* + * Linux network device link state notification + * + * Author: + * Stefan Rompf <sux@loplof.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/jiffies.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/bitops.h> +#include <asm/types.h> + + +enum lw_bits { + LW_RUNNING = 0, + LW_SE_USED +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(void *dummy); +static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL); + +static LIST_HEAD(lweventlist); +static DEFINE_SPINLOCK(lweventlist_lock); + +struct lw_event { + struct list_head list; + struct net_device *dev; +}; + +/* Avoid kmalloc() for most systems */ +static struct lw_event singleevent; + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ + LIST_HEAD(head); + struct list_head *n, *next; + + spin_lock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &head); + spin_unlock_irq(&lweventlist_lock); + + list_for_each_safe(n, next, &head) { + struct lw_event *event = list_entry(n, struct lw_event, list); + struct net_device *dev = event->dev; + + if (event == &singleevent) { + clear_bit(LW_SE_USED, &linkwatch_flags); + } else { + kfree(event); + } + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + if (dev->flags & IFF_UP) { + netdev_state_change(dev); + } + + dev_put(dev); + } +} + + +static void linkwatch_event(void *dummy) +{ + /* Limit the number of linkwatch events to one + * per second so that a runaway driver does not + * cause a storm of messages on the netlink + * socket + */ + linkwatch_nextevent = jiffies + HZ; + clear_bit(LW_RUNNING, &linkwatch_flags); + + rtnl_shlock(); + linkwatch_run_queue(); + rtnl_shunlock(); +} + + +void linkwatch_fire_event(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { + unsigned long flags; + struct lw_event *event; + + if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) { + event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC); + + if (unlikely(event == NULL)) { + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + return; + } + } else { + event = &singleevent; + } + + dev_hold(dev); + event->dev = dev; + + spin_lock_irqsave(&lweventlist_lock, flags); + list_add_tail(&event->list, &lweventlist); + spin_unlock_irqrestore(&lweventlist_lock, flags); + + if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) { + unsigned long thisevent = jiffies; + + if (thisevent >= linkwatch_nextevent) { + schedule_work(&linkwatch_work); + } else { + schedule_delayed_work(&linkwatch_work, linkwatch_nextevent - thisevent); + } + } + } +} + +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/neighbour.c b/net/core/neighbour.c new file mode 100644 index 000000000000..0a2f67bbef2e --- /dev/null +++ b/net/core/neighbour.c @@ -0,0 +1,2362 @@ +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + * Harald Welte Add neighbour cache statistics like rtstat + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/sched.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/times.h> +#include <net/neighbour.h> +#include <net/dst.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +#define PNEIGH_HASHMASK 0xF + +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); + +static struct neigh_table *neigh_tables; +static struct file_operations neigh_stat_seq_fops; + +/* + Neighbour hash table buckets are protected with rwlock tbl->lock. + + - All the scans/updates to hash buckets MUST be made under this lock. + - NOTHING clever should be made under this lock: no callbacks + to protocol backends, no attempts to send something to network. + It will result in deadlocks, if backend/driver wants to use neighbour + cache. + - If the entry requires some non-trivial actions, increase + its reference count and release table lock. + + Neighbour entries are protected: + - with reference count. + - with rwlock neigh->lock + + Reference count prevents destruction. + + neigh->lock mainly serializes ll address data and its validity state. + However, the same lock is used to protect another entry fields: + - timer + - resolution queue + + Again, nothing clever shall be made under neigh->lock, + the most complicated procedure, which we allow is dev->hard_header. + It is supposed, that dev->hard_header is simplistic and does + not make callbacks to neighbour tables. + + The last lock is neigh_tbl_lock. It is pure SMP lock, protecting + list of neighbour tables. This list is used only in process context, + */ + +static DEFINE_RWLOCK(neigh_tbl_lock); + +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonable choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return (base ? (net_random() % base) + (base >> 1) : 0); +} + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int shrunk = 0; + int i; + + NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); + + write_lock_bh(&tbl->lock); + for (i = 0; i <= tbl->hash_mask; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + /* Neighbour record may be discarded if: + * - nobody refers to it. + * - it is not permanent + */ + write_lock(&n->lock); + if (atomic_read(&n->refcnt) == 1 && + !(n->nud_state & NUD_PERMANENT)) { + *np = n->next; + n->dead = 1; + shrunk = 1; + write_unlock(&n->lock); + neigh_release(n); + continue; + } + write_unlock(&n->lock); + np = &n->next; + } + } + + tbl->last_flush = jiffies; + + write_unlock_bh(&tbl->lock); + + return shrunk; +} + +static int neigh_del_timer(struct neighbour *n) +{ + if ((n->nud_state & NUD_IN_TIMER) && + del_timer(&n->timer)) { + neigh_release(n); + return 1; + } + return 0; +} + +static void pneigh_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(list)) != NULL) { + dev_put(skb->dev); + kfree_skb(skb); + } +} + +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + + write_lock_bh(&tbl->lock); + + for (i=0; i <= tbl->hash_mask; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + write_lock_bh(&n->lock); + n->dead = 1; + neigh_del_timer(n); + write_unlock_bh(&n->lock); + neigh_release(n); + } + } + + write_unlock_bh(&tbl->lock); +} + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + + write_lock_bh(&tbl->lock); + + for (i = 0; i <= tbl->hash_mask; i++) { + struct neighbour *n, **np = &tbl->hash_buckets[i]; + + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + write_lock(&n->lock); + neigh_del_timer(n); + n->dead = 1; + + if (atomic_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. + */ + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } + write_unlock(&n->lock); + neigh_release(n); + } + } + + pneigh_ifdown(tbl, dev); + write_unlock_bh(&tbl->lock); + + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + return 0; +} + +static struct neighbour *neigh_alloc(struct neigh_table *tbl) +{ + struct neighbour *n = NULL; + unsigned long now = jiffies; + int entries; + + entries = atomic_inc_return(&tbl->entries) - 1; + if (entries >= tbl->gc_thresh3 || + (entries >= tbl->gc_thresh2 && + time_after(now, tbl->last_flush + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && + entries >= tbl->gc_thresh3) + goto out_entries; + } + + n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC); + if (!n) + goto out_entries; + + memset(n, 0, tbl->entry_size); + + skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = neigh_parms_clone(&tbl->parms); + init_timer(&n->timer); + n->timer.function = neigh_timer_handler; + n->timer.data = (unsigned long)n; + + NEIGH_CACHE_STAT_INC(tbl, allocs); + n->tbl = tbl; + atomic_set(&n->refcnt, 1); + n->dead = 1; +out: + return n; + +out_entries: + atomic_dec(&tbl->entries); + goto out; +} + +static struct neighbour **neigh_hash_alloc(unsigned int entries) +{ + unsigned long size = entries * sizeof(struct neighbour *); + struct neighbour **ret; + + if (size <= PAGE_SIZE) { + ret = kmalloc(size, GFP_ATOMIC); + } else { + ret = (struct neighbour **) + __get_free_pages(GFP_ATOMIC, get_order(size)); + } + if (ret) + memset(ret, 0, size); + + return ret; +} + +static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +{ + unsigned long size = entries * sizeof(struct neighbour *); + + if (size <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long)hash, get_order(size)); +} + +static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +{ + struct neighbour **new_hash, **old_hash; + unsigned int i, new_hash_mask, old_entries; + + NEIGH_CACHE_STAT_INC(tbl, hash_grows); + + BUG_ON(new_entries & (new_entries - 1)); + new_hash = neigh_hash_alloc(new_entries); + if (!new_hash) + return; + + old_entries = tbl->hash_mask + 1; + new_hash_mask = new_entries - 1; + old_hash = tbl->hash_buckets; + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + for (i = 0; i < old_entries; i++) { + struct neighbour *n, *next; + + for (n = old_hash[i]; n; n = next) { + unsigned int hash_val = tbl->hash(n->primary_key, n->dev); + + hash_val &= new_hash_mask; + next = n->next; + + n->next = new_hash[hash_val]; + new_hash[hash_val] = n; + } + } + tbl->hash_buckets = new_hash; + tbl->hash_mask = new_hash_mask; + + neigh_hash_free(old_hash, old_entries); +} + +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + read_lock_bh(&tbl->lock); + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { + neigh_hold(n); + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + read_unlock_bh(&tbl->lock); + return n; +} + +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + read_lock_bh(&tbl->lock); + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (!memcmp(n->primary_key, pkey, key_len)) { + neigh_hold(n); + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + read_unlock_bh(&tbl->lock); + return n; +} + +struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + u32 hash_val; + int key_len = tbl->key_len; + int error; + struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + + if (!n) { + rc = ERR_PTR(-ENOBUFS); + goto out; + } + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + dev_hold(dev); + + /* Protocol specific setup. */ + if (tbl->constructor && (error = tbl->constructor(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + /* Device specific setup. */ + if (n->parms->neigh_setup && + (error = n->parms->neigh_setup(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + + write_lock_bh(&tbl->lock); + + if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) + neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + + hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + + if (n->parms->dead) { + rc = ERR_PTR(-EINVAL); + goto out_tbl_unlock; + } + + for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + neigh_hold(n1); + rc = n1; + goto out_tbl_unlock; + } + } + + n->next = tbl->hash_buckets[hash_val]; + tbl->hash_buckets[hash_val] = n; + n->dead = 0; + neigh_hold(n); + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK2("neigh %p is created.\n", n); + rc = n; +out: + return rc; +out_tbl_unlock: + write_unlock_bh(&tbl->lock); +out_neigh_release: + neigh_release(n); + goto out; +} + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, int creat) +{ + struct pneigh_entry *n; + int key_len = tbl->key_len; + u32 hash_val = *(u32 *)(pkey + key_len - 4); + + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + + read_lock_bh(&tbl->lock); + + for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { + if (!memcmp(n->key, pkey, key_len) && + (n->dev == dev || !n->dev)) { + read_unlock_bh(&tbl->lock); + goto out; + } + } + read_unlock_bh(&tbl->lock); + n = NULL; + if (!creat) + goto out; + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (!n) + goto out; + + memcpy(n->key, pkey, key_len); + n->dev = dev; + if (dev) + dev_hold(dev); + + if (tbl->pconstructor && tbl->pconstructor(n)) { + if (dev) + dev_put(dev); + kfree(n); + n = NULL; + goto out; + } + + write_lock_bh(&tbl->lock); + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + write_unlock_bh(&tbl->lock); +out: + return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n, **np; + int key_len = tbl->key_len; + u32 hash_val = *(u32 *)(pkey + key_len - 4); + + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + + write_lock_bh(&tbl->lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + np = &n->next) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev) { + *np = n->next; + write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + kfree(n); + return 0; + } + } + write_unlock_bh(&tbl->lock); + return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + struct pneigh_entry *n, **np; + u32 h; + + for (h = 0; h <= PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n = *np) != NULL) { + if (!dev || n->dev == dev) { + *np = n->next; + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + kfree(n); + continue; + } + np = &n->next; + } + } + return -ENOENT; +} + + +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); + + if (!neigh->dead) { + printk(KERN_WARNING + "Destroying alive neighbour %p\n", neigh); + dump_stack(); + return; + } + + if (neigh_del_timer(neigh)) + printk(KERN_WARNING "Impossible event.\n"); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + write_lock_bh(&hh->hh_lock); + hh->hh_output = neigh_blackhole; + write_unlock_bh(&hh->hh_lock); + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + } + + if (neigh->ops && neigh->ops->destructor) + (neigh->ops->destructor)(neigh); + + skb_queue_purge(&neigh->arp_queue); + + dev_put(neigh->dev); + neigh_parms_put(neigh->parms); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + + atomic_dec(&neigh->tbl->entries); + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} + +/* Neighbour state is suspicious; + disable fast path. + + Called with write_locked neigh. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + + neigh->output = neigh->ops->output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. + + Called with write_locked neigh. + */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; +} + +static void neigh_periodic_timer(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + struct neighbour *n, **np; + unsigned long expire, now = jiffies; + + NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + + write_lock(&tbl->lock); + + /* + * periodically recompute ReachableTime from random function + */ + + if (time_after(now, tbl->last_rand + 300 * HZ)) { + struct neigh_parms *p; + tbl->last_rand = now; + for (p = &tbl->parms; p; p = p->next) + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + } + + np = &tbl->hash_buckets[tbl->hash_chain_gc]; + tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask); + + while ((n = *np) != NULL) { + unsigned int state; + + write_lock(&n->lock); + + state = n->nud_state; + if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + write_unlock(&n->lock); + goto next_elt; + } + + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(now, n->used + n->parms->gc_staletime))) { + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); + neigh_release(n); + continue; + } + write_unlock(&n->lock); + +next_elt: + np = &n->next; + } + + /* Cycle through all hash buckets every base_reachable_time/2 ticks. + * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 + * base_reachable_time. + */ + expire = tbl->parms.base_reachable_time >> 1; + expire /= (tbl->hash_mask + 1); + if (!expire) + expire = 1; + + mod_timer(&tbl->gc_timer, now + expire); + + write_unlock(&tbl->lock); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return (n->nud_state & NUD_PROBE ? + p->ucast_probes : + p->ucast_probes + p->app_probes + p->mcast_probes); +} + + +/* Called when a timer expires for a neighbour entry. */ + +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now, next; + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; + + write_lock(&neigh->lock); + + state = neigh->nud_state; + now = jiffies; + next = now + HZ; + + if (!(state & NUD_IN_TIMER)) { +#ifndef CONFIG_SMP + printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); +#endif + goto out; + } + + if (state & NUD_REACHABLE) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->reachable_time)) { + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else if (time_before_eq(now, + neigh->used + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh_suspect(neigh); + next = now + neigh->parms->delay_probe_time; + } else { + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh->nud_state = NUD_STALE; + neigh_suspect(neigh); + } + } else if (state & NUD_DELAY) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + neigh->nud_state = NUD_REACHABLE; + neigh_connect(neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + atomic_set(&neigh->probes, 0); + next = now + neigh->parms->retrans_time; + } + } else { + /* NUD_PROBE|NUD_INCOMPLETE */ + next = now + neigh->parms->retrans_time; + } + + if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && + atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + notify = 1; + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + } + + if (neigh->nud_state & NUD_IN_TIMER) { + neigh_hold(neigh); + if (time_before(next, jiffies + HZ/2)) + next = jiffies + HZ/2; + neigh->timer.expires = next; + add_timer(&neigh->timer); + } + if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { + struct sk_buff *skb = skb_peek(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb_get(skb); + write_unlock(&neigh->lock); + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + if (skb) + kfree_skb(skb); + } else { +out: + write_unlock(&neigh->lock); + } + +#ifdef CONFIG_ARPD + if (notify && neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + neigh_release(neigh); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + int rc; + unsigned long now; + + write_lock_bh(&neigh->lock); + + rc = 0; + if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) + goto out_unlock_bh; + + now = jiffies; + + if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + atomic_set(&neigh->probes, neigh->parms->ucast_probes); + neigh->nud_state = NUD_INCOMPLETE; + neigh_hold(neigh); + neigh->timer.expires = now + 1; + add_timer(&neigh->timer); + } else { + neigh->nud_state = NUD_FAILED; + write_unlock_bh(&neigh->lock); + + if (skb) + kfree_skb(skb); + return 1; + } + } else if (neigh->nud_state & NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh_hold(neigh); + neigh->nud_state = NUD_DELAY; + neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; + add_timer(&neigh->timer); + } + + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= + neigh->parms->queue_len) { + struct sk_buff *buff; + buff = neigh->arp_queue.next; + __skb_unlink(buff, &neigh->arp_queue); + kfree_skb(buff); + } + __skb_queue_tail(&neigh->arp_queue, skb); + } + rc = 1; + } +out_unlock_bh: + write_unlock_bh(&neigh->lock); + return rc; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, struct net_device*, unsigned char *) = + neigh->dev->header_cache_update; + + if (update) { + for (hh = neigh->hh; hh; hh = hh->hh_next) { + write_lock_bh(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_unlock_bh(&hh->hh_lock); + } + } +} + + + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- flags + NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, + if it is different. + NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" + lladdr instead of overriding it + if it is different. + It also allows to retain current state + if lladdr is unchanged. + NEIGH_UPDATE_F_ADMIN means that the change is administrative. + + NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing + NTF_ROUTER flag. + NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as + a router. + + Caller MUST hold reference count on the entry. + */ + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags) +{ + u8 old; + int err; +#ifdef CONFIG_ARPD + int notify = 0; +#endif + struct net_device *dev; + int update_isrouter = 0; + + write_lock_bh(&neigh->lock); + + dev = neigh->dev; + old = neigh->nud_state; + err = -EPERM; + + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; + + if (!(new & NUD_VALID)) { + neigh_del_timer(neigh); + if (old & NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + err = 0; +#ifdef CONFIG_ARPD + notify = old & NUD_VALID; +#endif + goto out; + } + + /* Compare new lladdr with cached one */ + if (!dev->addr_len) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if ((old & NUD_VALID) && + !memcmp(lladdr, neigh->ha, dev->addr_len)) + lladdr = neigh->ha; + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. + */ + err = -EINVAL; + if (!(old & NUD_VALID)) + goto out; + lladdr = neigh->ha; + } + + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + err = 0; + update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + if (old & NUD_VALID) { + if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { + update_isrouter = 0; + if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && + (old & NUD_CONNECTED)) { + lladdr = neigh->ha; + new = NUD_STALE; + } else + goto out; + } else { + if (lladdr == neigh->ha && new == NUD_STALE && + ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || + (old & NUD_CONNECTED)) + ) + new = old; + } + } + + if (new != old) { + neigh_del_timer(neigh); + if (new & NUD_IN_TIMER) { + neigh_hold(neigh); + neigh->timer.expires = jiffies + + ((new & NUD_REACHABLE) ? + neigh->parms->reachable_time : 0); + add_timer(&neigh->timer); + } + neigh->nud_state = new; + } + + if (lladdr != neigh->ha) { + memcpy(&neigh->ha, lladdr, dev->addr_len); + neigh_update_hhs(neigh); + if (!(new & NUD_CONNECTED)) + neigh->confirmed = jiffies - + (neigh->parms->base_reachable_time << 1); +#ifdef CONFIG_ARPD + notify = 1; +#endif + } + if (new == old) + goto out; + if (new & NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old & NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct neighbour *n1 = neigh; + write_unlock_bh(&neigh->lock); + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (skb->dst && skb->dst->neighbour) + n1 = skb->dst->neighbour; + n1->output(skb); + write_lock_bh(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + } +out: + if (update_isrouter) { + neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? + (neigh->flags | NTF_ROUTER) : + (neigh->flags & ~NTF_ROUTER); + } + write_unlock_bh(&neigh->lock); +#ifdef CONFIG_ARPD + if (notify && neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + return err; +} + +struct neighbour *neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct net_device *dev) +{ + struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, + lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_OVERRIDE); + return neigh; +} + +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, + u16 protocol) +{ + struct hh_cache *hh; + struct net_device *dev = dst->dev; + + for (hh = n->hh; hh; hh = hh->hh_next) + if (hh->hh_type == protocol) + break; + + if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { + memset(hh, 0, sizeof(struct hh_cache)); + rwlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 0); + hh->hh_next = NULL; + if (dev->hard_header_cache(n, hh)) { + kfree(hh); + hh = NULL; + } else { + atomic_inc(&hh->hh_refcnt); + hh->hh_next = n->hh; + n->hh = hh; + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + } + } + if (hh) { + atomic_inc(&hh->hh_refcnt); + dst->hh = hh; + } +} + +/* This function can be used in contexts, where only old dev_queue_xmit + worked, f.e. if you want to override normal output path (eql, shaper), + but resolution is not made yet. + */ + +int neigh_compat_output(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, + skb->len) < 0 && + dev->rebuild_header(skb)) + return 0; + + return dev_queue_xmit(skb); +} + +/* Slow and careful. */ + +int neigh_resolve_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh; + int rc = 0; + + if (!dst || !(neigh = dst->neighbour)) + goto discard; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (!neigh_event_send(neigh, skb)) { + int err; + struct net_device *dev = neigh->dev; + if (dev->hard_header_cache && !dst->hh) { + write_lock_bh(&neigh->lock); + if (!dst->hh) + neigh_hh_init(neigh, dst, dst->ops->protocol); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + write_unlock_bh(&neigh->lock); + } else { + read_lock_bh(&neigh->lock); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + } + if (err >= 0) + rc = neigh->ops->queue_xmit(skb); + else + goto out_kfree_skb; + } +out: + return rc; +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", + dst, dst ? dst->neighbour : NULL); +out_kfree_skb: + rc = -EINVAL; + kfree_skb(skb); + goto out; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct net_device *dev = neigh->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + read_lock_bh(&neigh->lock); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + if (err >= 0) + err = neigh->ops->queue_xmit(skb); + else { + err = -EINVAL; + kfree_skb(skb); + } + return err; +} + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb; + + spin_lock(&tbl->proxy_queue.lock); + + skb = tbl->proxy_queue.next; + + while (skb != (struct sk_buff *)&tbl->proxy_queue) { + struct sk_buff *back = skb; + long tdif = back->stamp.tv_usec - now; + + skb = skb->next; + if (tdif <= 0) { + struct net_device *dev = back->dev; + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) + tbl->proxy_redo(back); + else + kfree_skb(back); + + dev_put(dev); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) + mod_timer(&tbl->proxy_timer, jiffies + sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + unsigned long sched_next = now + (net_random() % p->proxy_delay); + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + skb->stamp.tv_sec = LOCALLY_ENQUEUED; + skb->stamp.tv_usec = sched_next; + + spin_lock(&tbl->proxy_queue.lock); + if (del_timer(&tbl->proxy_timer)) { + if (time_before(tbl->proxy_timer.expires, sched_next)) + sched_next = tbl->proxy_timer.expires; + } + dst_release(skb->dst); + skb->dst = NULL; + dev_hold(skb->dev); + __skb_queue_tail(&tbl->proxy_queue, skb); + mod_timer(&tbl->proxy_timer, sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + + +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, + struct neigh_table *tbl) +{ + struct neigh_parms *p = kmalloc(sizeof(*p), GFP_KERNEL); + + if (p) { + memcpy(p, &tbl->parms, sizeof(*p)); + p->tbl = tbl; + atomic_set(&p->refcnt, 1); + INIT_RCU_HEAD(&p->rcu_head); + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + p->sysctl_table = NULL; + write_lock_bh(&tbl->lock); + p->next = tbl->parms.next; + tbl->parms.next = p; + write_unlock_bh(&tbl->lock); + } + return p; +} + +static void neigh_rcu_free_parms(struct rcu_head *head) +{ + struct neigh_parms *parms = + container_of(head, struct neigh_parms, rcu_head); + + neigh_parms_put(parms); +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (!parms || parms == &tbl->parms) + return; + write_lock_bh(&tbl->lock); + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + *p = parms->next; + parms->dead = 1; + write_unlock_bh(&tbl->lock); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); + return; + } + } + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK1("neigh_parms_release: not found\n"); +} + +void neigh_parms_destroy(struct neigh_parms *parms) +{ + kfree(parms); +} + + +void neigh_table_init(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + unsigned long phsize; + + atomic_set(&tbl->parms.refcnt, 1); + INIT_RCU_HEAD(&tbl->parms.rcu_head); + tbl->parms.reachable_time = + neigh_rand_reach_time(tbl->parms.base_reachable_time); + + if (!tbl->kmem_cachep) + tbl->kmem_cachep = kmem_cache_create(tbl->id, + tbl->entry_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!tbl->kmem_cachep) + panic("cannot create neighbour cache"); + + tbl->stats = alloc_percpu(struct neigh_statistics); + if (!tbl->stats) + panic("cannot create neighbour cache statistics"); + +#ifdef CONFIG_PROC_FS + tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); + if (!tbl->pde) + panic("cannot create neighbour proc dir entry"); + tbl->pde->proc_fops = &neigh_stat_seq_fops; + tbl->pde->data = tbl; +#endif + + tbl->hash_mask = 1; + tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + + phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); + tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL); + + if (!tbl->hash_buckets || !tbl->phash_buckets) + panic("cannot allocate neighbour cache hashes"); + + memset(tbl->phash_buckets, 0, phsize); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + + rwlock_init(&tbl->lock); + init_timer(&tbl->gc_timer); + tbl->gc_timer.data = (unsigned long)tbl; + tbl->gc_timer.function = neigh_periodic_timer; + tbl->gc_timer.expires = now + 1; + add_timer(&tbl->gc_timer); + + init_timer(&tbl->proxy_timer); + tbl->proxy_timer.data = (unsigned long)tbl; + tbl->proxy_timer.function = neigh_proxy_process; + skb_queue_head_init(&tbl->proxy_queue); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time * 20; + write_lock(&neigh_tbl_lock); + tbl->next = neigh_tables; + neigh_tables = tbl; + write_unlock(&neigh_tbl_lock); +} + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + /* It is not clean... Fix it to unload IPv6 module safely */ + del_timer_sync(&tbl->gc_timer); + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + neigh_ifdown(tbl, NULL); + if (atomic_read(&tbl->entries)) + printk(KERN_CRIT "neighbour leakage\n"); + write_lock(&neigh_tbl_lock); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + break; + } + } + write_unlock(&neigh_tbl_lock); + + neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); + tbl->hash_buckets = NULL; + + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + + return 0; +} + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err = -ENODEV; + + if (ndm->ndm_ifindex && + (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + goto out; + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + struct rtattr *dst_attr = nda[NDA_DST - 1]; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + err = -EINVAL; + if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) + goto out_dev_put; + + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, RTA_DATA(dst_attr), dev); + goto out_dev_put; + } + + if (!dev) + goto out; + + n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); + if (n) { + err = neigh_update(n, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(n); + } + goto out_dev_put; + } + read_unlock(&neigh_tbl_lock); + err = -EADDRNOTAVAIL; +out_dev_put: + if (dev) + dev_put(dev); +out: + return err; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err = -ENODEV; + + if (ndm->ndm_ifindex && + (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + goto out; + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + struct rtattr *lladdr_attr = nda[NDA_LLADDR - 1]; + struct rtattr *dst_attr = nda[NDA_DST - 1]; + int override = 1; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + err = -EINVAL; + if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) + goto out_dev_put; + + if (ndm->ndm_flags & NTF_PROXY) { + err = -ENOBUFS; + if (pneigh_lookup(tbl, RTA_DATA(dst_attr), dev, 1)) + err = 0; + goto out_dev_put; + } + + err = -EINVAL; + if (!dev) + goto out; + if (lladdr_attr && RTA_PAYLOAD(lladdr_attr) < dev->addr_len) + goto out_dev_put; + + n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); + if (n) { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(n); + goto out_dev_put; + } + + override = nlh->nlmsg_flags & NLM_F_REPLACE; + } else if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; + goto out_dev_put; + } else { + n = __neigh_lookup_errno(tbl, RTA_DATA(dst_attr), dev); + if (IS_ERR(n)) { + err = PTR_ERR(n); + goto out_dev_put; + } + } + + err = neigh_update(n, + lladdr_attr ? RTA_DATA(lladdr_attr) : NULL, + ndm->ndm_state, + (override ? NEIGH_UPDATE_F_OVERRIDE : 0) | + NEIGH_UPDATE_F_ADMIN); + + neigh_release(n); + goto out_dev_put; + } + + read_unlock(&neigh_tbl_lock); + err = -EADDRNOTAVAIL; +out_dev_put: + if (dev) + dev_put(dev); +out: + return err; +} + + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, + u32 pid, u32 seq, int event) +{ + unsigned long now = jiffies; + unsigned char *b = skb->tail; + struct nda_cacheinfo ci; + int locked = 0; + u32 probes; + struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event, + sizeof(struct ndmsg)); + struct ndmsg *ndm = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0; + ndm->ndm_family = n->ops->family; + ndm->ndm_flags = n->flags; + ndm->ndm_type = n->type; + ndm->ndm_ifindex = n->dev->ifindex; + RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); + read_lock_bh(&n->lock); + locked = 1; + ndm->ndm_state = n->nud_state; + if (n->nud_state & NUD_VALID) + RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); + ci.ndm_used = now - n->used; + ci.ndm_confirmed = now - n->confirmed; + ci.ndm_updated = now - n->updated; + ci.ndm_refcnt = atomic_read(&n->refcnt) - 1; + probes = atomic_read(&n->probes); + read_unlock_bh(&n->lock); + locked = 0; + RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + RTA_PUT(skb, NDA_PROBES, sizeof(probes), &probes); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + if (locked) + read_unlock_bh(&n->lock); + skb_trim(skb, b - skb->data); + return -1; +} + + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct neighbour *n; + int rc, h, s_h = cb->args[1]; + int idx, s_idx = idx = cb->args[2]; + + for (h = 0; h <= tbl->hash_mask; h++) { + if (h < s_h) + continue; + if (h > s_h) + s_idx = 0; + read_lock_bh(&tbl->lock); + for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { + if (idx < s_idx) + continue; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH) <= 0) { + read_unlock_bh(&tbl->lock); + rc = -1; + goto out; + } + } + read_unlock_bh(&tbl->lock); + } + rc = skb->len; +out: + cb->args[1] = h; + cb->args[2] = idx; + return rc; +} + +int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neigh_table *tbl; + int t, family, s_t; + + read_lock(&neigh_tbl_lock); + family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; + s_t = cb->args[0]; + + for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { + if (t < s_t || (family && tbl->family != family)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args) - + sizeof(cb->args[0])); + if (neigh_dump_table(tbl, skb, cb) < 0) + break; + } + read_unlock(&neigh_tbl_lock); + + cb->args[0] = t; + return skb->len; +} + +void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) +{ + int chain; + + read_lock_bh(&tbl->lock); + for (chain = 0; chain <= tbl->hash_mask; chain++) { + struct neighbour *n; + + for (n = tbl->hash_buckets[chain]; n; n = n->next) + cb(n, cookie); + } + read_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_for_each); + +/* The tbl->lock must be held as a writer and BH disabled. */ +void __neigh_for_each_release(struct neigh_table *tbl, + int (*cb)(struct neighbour *)) +{ + int chain; + + for (chain = 0; chain <= tbl->hash_mask; chain++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[chain]; + while ((n = *np) != NULL) { + int release; + + write_lock(&n->lock); + release = cb(n); + if (release) { + *np = n->next; + n->dead = 1; + } else + np = &n->next; + write_unlock(&n->lock); + if (release) + neigh_release(n); + } + } +} +EXPORT_SYMBOL(__neigh_for_each_release); + +#ifdef CONFIG_PROC_FS + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + struct neighbour *n = NULL; + int bucket = state->bucket; + + state->flags &= ~NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { + n = tbl->hash_buckets[bucket]; + + while (n) { + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, &fakep); + if (!v) + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + if (n->nud_state & ~NUD_NOARP) + break; + next: + n = n->next; + } + + if (n) + break; + } + state->bucket = bucket; + + return n; +} + +static struct neighbour *neigh_get_next(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + } + n = n->next; + + while (1) { + while (n) { + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + + if (n->nud_state & ~NUD_NOARP) + break; + next: + n = n->next; + } + + if (n) + break; + + if (++state->bucket > tbl->hash_mask) + break; + + n = tbl->hash_buckets[state->bucket]; + } + + if (n && pos) + --(*pos); + return n; +} + +static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct neighbour *n = neigh_get_first(seq); + + if (n) { + while (*pos) { + n = neigh_get_next(seq, n, pos); + if (!n) + break; + } + } + return *pos ? NULL : n; +} + +static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + struct pneigh_entry *pn = NULL; + int bucket = state->bucket; + + state->flags |= NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { + pn = tbl->phash_buckets[bucket]; + if (pn) + break; + } + state->bucket = bucket; + + return pn; +} + +static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, + struct pneigh_entry *pn, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + pn = pn->next; + while (!pn) { + if (++state->bucket > PNEIGH_HASHMASK) + break; + pn = tbl->phash_buckets[state->bucket]; + if (pn) + break; + } + + if (pn && pos) + --(*pos); + + return pn; +} + +static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct pneigh_entry *pn = pneigh_get_first(seq); + + if (pn) { + while (*pos) { + pn = pneigh_get_next(seq, pn, pos); + if (!pn) + break; + } + } + return *pos ? NULL : pn; +} + +static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + void *rc; + + rc = neigh_get_idx(seq, pos); + if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_idx(seq, pos); + + return rc; +} + +void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) +{ + struct neigh_seq_state *state = seq->private; + loff_t pos_minus_one; + + state->tbl = tbl; + state->bucket = 0; + state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); + + read_lock_bh(&tbl->lock); + + pos_minus_one = *pos - 1; + return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL(neigh_seq_start); + +void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_seq_state *state; + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = neigh_get_idx(seq, pos); + goto out; + } + + state = seq->private; + if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { + rc = neigh_get_next(seq, v, NULL); + if (rc) + goto out; + if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_first(seq); + } else { + BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); + rc = pneigh_get_next(seq, v, NULL); + } +out: + ++(*pos); + return rc; +} +EXPORT_SYMBOL(neigh_seq_next); + +void neigh_seq_stop(struct seq_file *seq, void *v) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + read_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_seq_stop); + +/* statistics via seq_file */ + +static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct proc_dir_entry *pde = seq->private; + struct neigh_table *tbl = pde->data; + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = seq->private; + struct neigh_table *tbl = pde->data; + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void neigh_stat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int neigh_stat_seq_show(struct seq_file *seq, void *v) +{ + struct proc_dir_entry *pde = seq->private; + struct neigh_table *tbl = pde->data; + struct neigh_statistics *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs forced_gc_goal_miss\n"); + return 0; + } + + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx %08lx\n", + atomic_read(&tbl->entries), + + st->allocs, + st->destroys, + st->hash_grows, + + st->lookups, + st->hits, + + st->res_failed, + + st->rcv_probes_mcast, + st->rcv_probes_ucast, + + st->periodic_gc_runs, + st->forced_gc_runs + ); + + return 0; +} + +static struct seq_operations neigh_stat_seq_ops = { + .start = neigh_stat_seq_start, + .next = neigh_stat_seq_next, + .stop = neigh_stat_seq_stop, + .show = neigh_stat_seq_show, +}; + +static int neigh_stat_seq_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &neigh_stat_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode); + } + return ret; +}; + +static struct file_operations neigh_stat_seq_fops = { + .owner = THIS_MODULE, + .open = neigh_stat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); + struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr *)skb->data; + nlh->nlmsg_flags = NLM_F_REQUEST; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +static void neigh_app_notify(struct neighbour *n) +{ + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); + struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr *)skb->data; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +#endif /* CONFIG_ARPD */ + +#ifdef CONFIG_SYSCTL + +static struct neigh_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table neigh_vars[__NET_NEIGH_MAX]; + ctl_table neigh_dev[2]; + ctl_table neigh_neigh_dir[2]; + ctl_table neigh_proto_dir[2]; + ctl_table neigh_root_dir[2]; +} neigh_sysctl_template = { + .neigh_vars = { + { + .ctl_name = NET_NEIGH_MCAST_SOLICIT, + .procname = "mcast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_UCAST_SOLICIT, + .procname = "ucast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_APP_SOLICIT, + .procname = "app_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_RETRANS_TIME, + .procname = "retrans_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_REACHABLE_TIME, + .procname = "base_reachable_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_DELAY_PROBE_TIME, + .procname = "delay_first_probe_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_GC_STALE_TIME, + .procname = "gc_stale_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_UNRES_QLEN, + .procname = "unres_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_PROXY_QLEN, + .procname = "proxy_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_ANYCAST_DELAY, + .procname = "anycast_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_PROXY_DELAY, + .procname = "proxy_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_LOCKTIME, + .procname = "locktime", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_GC_INTERVAL, + .procname = "gc_interval", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_GC_THRESH1, + .procname = "gc_thresh1", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_GC_THRESH2, + .procname = "gc_thresh2", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_GC_THRESH3, + .procname = "gc_thresh3", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_RETRANS_TIME_MS, + .procname = "retrans_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { + .ctl_name = NET_NEIGH_REACHABLE_TIME_MS, + .procname = "base_reachable_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + }, + .neigh_dev = { + { + .ctl_name = NET_PROTO_CONF_DEFAULT, + .procname = "default", + .mode = 0555, + }, + }, + .neigh_neigh_dir = { + { + .procname = "neigh", + .mode = 0555, + }, + }, + .neigh_proto_dir = { + { + .mode = 0555, + }, + }, + .neigh_root_dir = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + }, + }, +}; + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, + int p_id, int pdev_id, char *p_name, + proc_handler *handler, ctl_handler *strategy) +{ + struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + const char *dev_name_source = NULL; + char *dev_name = NULL; + int err = 0; + + if (!t) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[0].data = &p->mcast_probes; + t->neigh_vars[1].data = &p->ucast_probes; + t->neigh_vars[2].data = &p->app_probes; + t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->base_reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + + if (dev) { + dev_name_source = dev->name; + t->neigh_dev[0].ctl_name = dev->ifindex; + t->neigh_vars[12].procname = NULL; + t->neigh_vars[13].procname = NULL; + t->neigh_vars[14].procname = NULL; + t->neigh_vars[15].procname = NULL; + } else { + dev_name_source = t->neigh_dev[0].procname; + t->neigh_vars[12].data = (int *)(p + 1); + t->neigh_vars[13].data = (int *)(p + 1) + 1; + t->neigh_vars[14].data = (int *)(p + 1) + 2; + t->neigh_vars[15].data = (int *)(p + 1) + 3; + } + + t->neigh_vars[16].data = &p->retrans_time; + t->neigh_vars[17].data = &p->base_reachable_time; + + if (handler || strategy) { + /* RetransTime */ + t->neigh_vars[3].proc_handler = handler; + t->neigh_vars[3].strategy = strategy; + t->neigh_vars[3].extra1 = dev; + /* ReachableTime */ + t->neigh_vars[4].proc_handler = handler; + t->neigh_vars[4].strategy = strategy; + t->neigh_vars[4].extra1 = dev; + /* RetransTime (in milliseconds)*/ + t->neigh_vars[16].proc_handler = handler; + t->neigh_vars[16].strategy = strategy; + t->neigh_vars[16].extra1 = dev; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[17].proc_handler = handler; + t->neigh_vars[17].strategy = strategy; + t->neigh_vars[17].extra1 = dev; + } + + dev_name = net_sysctl_strdup(dev_name_source); + if (!dev_name) { + err = -ENOBUFS; + goto free; + } + + t->neigh_dev[0].procname = dev_name; + + t->neigh_neigh_dir[0].ctl_name = pdev_id; + + t->neigh_proto_dir[0].procname = p_name; + t->neigh_proto_dir[0].ctl_name = p_id; + + t->neigh_dev[0].child = t->neigh_vars; + t->neigh_neigh_dir[0].child = t->neigh_dev; + t->neigh_proto_dir[0].child = t->neigh_neigh_dir; + t->neigh_root_dir[0].child = t->neigh_proto_dir; + + t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); + if (!t->sysctl_header) { + err = -ENOBUFS; + goto free_procname; + } + p->sysctl_table = t; + return 0; + + /* error path */ + free_procname: + kfree(dev_name); + free: + kfree(t); + + return err; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->neigh_dev[0].procname); + kfree(t); + } +} + +#endif /* CONFIG_SYSCTL */ + +EXPORT_SYMBOL(__neigh_event_send); +EXPORT_SYMBOL(neigh_add); +EXPORT_SYMBOL(neigh_changeaddr); +EXPORT_SYMBOL(neigh_compat_output); +EXPORT_SYMBOL(neigh_connected_output); +EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(neigh_delete); +EXPORT_SYMBOL(neigh_destroy); +EXPORT_SYMBOL(neigh_dump_info); +EXPORT_SYMBOL(neigh_event_ns); +EXPORT_SYMBOL(neigh_ifdown); +EXPORT_SYMBOL(neigh_lookup); +EXPORT_SYMBOL(neigh_lookup_nodev); +EXPORT_SYMBOL(neigh_parms_alloc); +EXPORT_SYMBOL(neigh_parms_release); +EXPORT_SYMBOL(neigh_rand_reach_time); +EXPORT_SYMBOL(neigh_resolve_output); +EXPORT_SYMBOL(neigh_table_clear); +EXPORT_SYMBOL(neigh_table_init); +EXPORT_SYMBOL(neigh_update); +EXPORT_SYMBOL(neigh_update_hhs); +EXPORT_SYMBOL(pneigh_enqueue); +EXPORT_SYMBOL(pneigh_lookup); + +#ifdef CONFIG_ARPD +EXPORT_SYMBOL(neigh_app_ns); +#endif +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(neigh_sysctl_register); +EXPORT_SYMBOL(neigh_sysctl_unregister); +#endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c new file mode 100644 index 000000000000..060f703659e8 --- /dev/null +++ b/net/core/net-sysfs.c @@ -0,0 +1,461 @@ +/* + * net-sysfs.c - network device class and attributes + * + * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/wireless.h> + +#define to_class_dev(obj) container_of(obj,struct class_device,kobj) +#define to_net_dev(class) container_of(class, struct net_device, class_dev) + +static const char fmt_hex[] = "%#x\n"; +static const char fmt_dec[] = "%d\n"; +static const char fmt_ulong[] = "%lu\n"; + +static inline int dev_isalive(const struct net_device *dev) +{ + return dev->reg_state == NETREG_REGISTERED; +} + +/* use same locking rules as GIF* ioctl's */ +static ssize_t netdev_show(const struct class_device *cd, char *buf, + ssize_t (*format)(const struct net_device *, char *)) +{ + struct net_device *net = to_net_dev(cd); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = (*format)(net, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* generate a show function for simple field */ +#define NETDEVICE_SHOW(field, format_string) \ +static ssize_t format_##field(const struct net_device *net, char *buf) \ +{ \ + return sprintf(buf, format_string, net->field); \ +} \ +static ssize_t show_##field(struct class_device *cd, char *buf) \ +{ \ + return netdev_show(cd, buf, format_##field); \ +} + + +/* use same locking and permission rules as SIF* ioctl's */ +static ssize_t netdev_store(struct class_device *dev, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *net = to_net_dev(dev); + char *endp; + unsigned long new; + int ret = -EINVAL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + new = simple_strtoul(buf, &endp, 0); + if (endp == buf) + goto err; + + rtnl_lock(); + if (dev_isalive(net)) { + if ((ret = (*set)(net, new)) == 0) + ret = len; + } + rtnl_unlock(); + err: + return ret; +} + +/* generate a read-only network device class attribute */ +#define NETDEVICE_ATTR(field, format_string) \ +NETDEVICE_SHOW(field, format_string) \ +static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL) \ + +NETDEVICE_ATTR(addr_len, fmt_dec); +NETDEVICE_ATTR(iflink, fmt_dec); +NETDEVICE_ATTR(ifindex, fmt_dec); +NETDEVICE_ATTR(features, fmt_hex); +NETDEVICE_ATTR(type, fmt_dec); + +/* use same locking rules as GIFHWADDR ioctl's */ +static ssize_t format_addr(char *buf, const unsigned char *addr, int len) +{ + int i; + char *cp = buf; + + for (i = 0; i < len; i++) + cp += sprintf(cp, "%02x%c", addr[i], + i == (len - 1) ? '\n' : ':'); + return cp - buf; +} + +static ssize_t show_address(struct class_device *dev, char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = format_addr(buf, net->dev_addr, net->addr_len); + read_unlock(&dev_base_lock); + return ret; +} + +static ssize_t show_broadcast(struct class_device *dev, char *buf) +{ + struct net_device *net = to_net_dev(dev); + if (dev_isalive(net)) + return format_addr(buf, net->broadcast, net->addr_len); + return -EINVAL; +} + +static ssize_t show_carrier(struct class_device *dev, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + if (netif_running(netdev)) { + return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + return -EINVAL; +} + +static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static CLASS_DEVICE_ATTR(broadcast, S_IRUGO, show_broadcast, NULL); +static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); + +/* read-write attributes */ +NETDEVICE_SHOW(mtu, fmt_dec); + +static int change_mtu(struct net_device *net, unsigned long new_mtu) +{ + return dev_set_mtu(net, (int) new_mtu); +} + +static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len) +{ + return netdev_store(dev, buf, len, change_mtu); +} + +static CLASS_DEVICE_ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu); + +NETDEVICE_SHOW(flags, fmt_hex); + +static int change_flags(struct net_device *net, unsigned long new_flags) +{ + return dev_change_flags(net, (unsigned) new_flags); +} + +static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len) +{ + return netdev_store(dev, buf, len, change_flags); +} + +static CLASS_DEVICE_ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags); + +NETDEVICE_SHOW(tx_queue_len, fmt_ulong); + +static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +{ + net->tx_queue_len = new_len; + return 0; +} + +static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, size_t len) +{ + return netdev_store(dev, buf, len, change_tx_queue_len); +} + +static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, + store_tx_queue_len); + + +static struct class_device_attribute *net_class_attributes[] = { + &class_device_attr_ifindex, + &class_device_attr_iflink, + &class_device_attr_addr_len, + &class_device_attr_tx_queue_len, + &class_device_attr_features, + &class_device_attr_mtu, + &class_device_attr_flags, + &class_device_attr_type, + &class_device_attr_address, + &class_device_attr_broadcast, + &class_device_attr_carrier, + NULL +}; + +/* Show a given an attribute in the statistics group */ +static ssize_t netstat_show(const struct class_device *cd, char *buf, + unsigned long offset) +{ + struct net_device *dev = to_net_dev(cd); + struct net_device_stats *stats; + ssize_t ret = -EINVAL; + + if (offset > sizeof(struct net_device_stats) || + offset % sizeof(unsigned long) != 0) + WARN_ON(1); + + read_lock(&dev_base_lock); + if (dev_isalive(dev) && dev->get_stats && + (stats = (*dev->get_stats)(dev))) + ret = sprintf(buf, fmt_ulong, + *(unsigned long *)(((u8 *) stats) + offset)); + + read_unlock(&dev_base_lock); + return ret; +} + +/* generate a read-only statistics attribute */ +#define NETSTAT_ENTRY(name) \ +static ssize_t show_##name(struct class_device *cd, char *buf) \ +{ \ + return netstat_show(cd, buf, \ + offsetof(struct net_device_stats, name)); \ +} \ +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +NETSTAT_ENTRY(rx_packets); +NETSTAT_ENTRY(tx_packets); +NETSTAT_ENTRY(rx_bytes); +NETSTAT_ENTRY(tx_bytes); +NETSTAT_ENTRY(rx_errors); +NETSTAT_ENTRY(tx_errors); +NETSTAT_ENTRY(rx_dropped); +NETSTAT_ENTRY(tx_dropped); +NETSTAT_ENTRY(multicast); +NETSTAT_ENTRY(collisions); +NETSTAT_ENTRY(rx_length_errors); +NETSTAT_ENTRY(rx_over_errors); +NETSTAT_ENTRY(rx_crc_errors); +NETSTAT_ENTRY(rx_frame_errors); +NETSTAT_ENTRY(rx_fifo_errors); +NETSTAT_ENTRY(rx_missed_errors); +NETSTAT_ENTRY(tx_aborted_errors); +NETSTAT_ENTRY(tx_carrier_errors); +NETSTAT_ENTRY(tx_fifo_errors); +NETSTAT_ENTRY(tx_heartbeat_errors); +NETSTAT_ENTRY(tx_window_errors); +NETSTAT_ENTRY(rx_compressed); +NETSTAT_ENTRY(tx_compressed); + +static struct attribute *netstat_attrs[] = { + &class_device_attr_rx_packets.attr, + &class_device_attr_tx_packets.attr, + &class_device_attr_rx_bytes.attr, + &class_device_attr_tx_bytes.attr, + &class_device_attr_rx_errors.attr, + &class_device_attr_tx_errors.attr, + &class_device_attr_rx_dropped.attr, + &class_device_attr_tx_dropped.attr, + &class_device_attr_multicast.attr, + &class_device_attr_collisions.attr, + &class_device_attr_rx_length_errors.attr, + &class_device_attr_rx_over_errors.attr, + &class_device_attr_rx_crc_errors.attr, + &class_device_attr_rx_frame_errors.attr, + &class_device_attr_rx_fifo_errors.attr, + &class_device_attr_rx_missed_errors.attr, + &class_device_attr_tx_aborted_errors.attr, + &class_device_attr_tx_carrier_errors.attr, + &class_device_attr_tx_fifo_errors.attr, + &class_device_attr_tx_heartbeat_errors.attr, + &class_device_attr_tx_window_errors.attr, + &class_device_attr_rx_compressed.attr, + &class_device_attr_tx_compressed.attr, + NULL +}; + + +static struct attribute_group netstat_group = { + .name = "statistics", + .attrs = netstat_attrs, +}; + +#ifdef WIRELESS_EXT +/* helper function that does all the locking etc for wireless stats */ +static ssize_t wireless_show(struct class_device *cd, char *buf, + ssize_t (*format)(const struct iw_statistics *, + char *)) +{ + struct net_device *dev = to_net_dev(cd); + const struct iw_statistics *iw; + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(dev) && dev->get_wireless_stats + && (iw = dev->get_wireless_stats(dev)) != NULL) + ret = (*format)(iw, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* show function template for wireless fields */ +#define WIRELESS_SHOW(name, field, format_string) \ +static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ +{ \ + return sprintf(buf, format_string, iw->field); \ +} \ +static ssize_t show_iw_##name(struct class_device *cd, char *buf) \ +{ \ + return wireless_show(cd, buf, format_iw_##name); \ +} \ +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) + +WIRELESS_SHOW(status, status, fmt_hex); +WIRELESS_SHOW(link, qual.qual, fmt_dec); +WIRELESS_SHOW(level, qual.level, fmt_dec); +WIRELESS_SHOW(noise, qual.noise, fmt_dec); +WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); +WIRELESS_SHOW(crypt, discard.code, fmt_dec); +WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); +WIRELESS_SHOW(misc, discard.misc, fmt_dec); +WIRELESS_SHOW(retries, discard.retries, fmt_dec); +WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); + +static struct attribute *wireless_attrs[] = { + &class_device_attr_status.attr, + &class_device_attr_link.attr, + &class_device_attr_level.attr, + &class_device_attr_noise.attr, + &class_device_attr_nwid.attr, + &class_device_attr_crypt.attr, + &class_device_attr_fragment.attr, + &class_device_attr_retries.attr, + &class_device_attr_misc.attr, + &class_device_attr_beacon.attr, + NULL +}; + +static struct attribute_group wireless_group = { + .name = "wireless", + .attrs = wireless_attrs, +}; +#endif + +#ifdef CONFIG_HOTPLUG +static int netdev_hotplug(struct class_device *cd, char **envp, + int num_envp, char *buf, int size) +{ + struct net_device *dev = to_net_dev(cd); + int i = 0; + int n; + + /* pass interface in env to hotplug. */ + envp[i++] = buf; + n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1; + buf += n; + size -= n; + + if ((size <= 0) || (i >= num_envp)) + return -ENOMEM; + + envp[i] = NULL; + return 0; +} +#endif + +/* + * netdev_release -- destroy and free a dead device. + * Called when last reference to class_device kobject is gone. + */ +static void netdev_release(struct class_device *cd) +{ + struct net_device *dev + = container_of(cd, struct net_device, class_dev); + + BUG_ON(dev->reg_state != NETREG_RELEASED); + + kfree((char *)dev - dev->padded); +} + +static struct class net_class = { + .name = "net", + .release = netdev_release, +#ifdef CONFIG_HOTPLUG + .hotplug = netdev_hotplug, +#endif +}; + +void netdev_unregister_sysfs(struct net_device * net) +{ + struct class_device * class_dev = &(net->class_dev); + + if (net->get_stats) + sysfs_remove_group(&class_dev->kobj, &netstat_group); + +#ifdef WIRELESS_EXT + if (net->get_wireless_stats) + sysfs_remove_group(&class_dev->kobj, &wireless_group); +#endif + class_device_del(class_dev); + +} + +/* Create sysfs entries for network device. */ +int netdev_register_sysfs(struct net_device *net) +{ + struct class_device *class_dev = &(net->class_dev); + int i; + struct class_device_attribute *attr; + int ret; + + class_dev->class = &net_class; + class_dev->class_data = net; + + strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); + if ((ret = class_device_register(class_dev))) + goto out; + + for (i = 0; (attr = net_class_attributes[i]) != NULL; i++) { + if ((ret = class_device_create_file(class_dev, attr))) + goto out_unreg; + } + + + if (net->get_stats && + (ret = sysfs_create_group(&class_dev->kobj, &netstat_group))) + goto out_unreg; + +#ifdef WIRELESS_EXT + if (net->get_wireless_stats && + (ret = sysfs_create_group(&class_dev->kobj, &wireless_group))) + goto out_cleanup; + + return 0; +out_cleanup: + if (net->get_stats) + sysfs_remove_group(&class_dev->kobj, &netstat_group); +#else + return 0; +#endif + +out_unreg: + printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n", + net->name, ret); + class_device_unregister(class_dev); +out: + return ret; +} + +int netdev_sysfs_init(void) +{ + return class_register(&net_class); +} diff --git a/net/core/netfilter.c b/net/core/netfilter.c new file mode 100644 index 000000000000..e51cfa46950c --- /dev/null +++ b/net/core/netfilter.c @@ -0,0 +1,799 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. + * + * February 2000: Modified by James Morris to have 1 queue per protocol. + * 15-Mar-2000: Added NF_REPEAT --RR. + * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/wait.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <net/sock.h> +#include <net/route.h> +#include <linux/ip.h> + +/* In this code, we can be waiting indefinitely for userspace to + * service a packet if a hook returns NF_QUEUE. We could keep a count + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(format , ## args) +#else +#define NFDEBUG(format, args...) +#endif + +/* Sockopts only registered and called from user context, so + net locking would be overkill. Also, [gs]etsockopt calls may + sleep. */ +static DECLARE_MUTEX(nf_sockopt_mutex); + +struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +static LIST_HEAD(nf_sockopts); +static DEFINE_SPINLOCK(nf_hook_lock); + +/* + * A queue handler may be registered for each protocol. Each is protected by + * long term mutex. The handler must provide an an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static struct nf_queue_handler_t { + nf_queue_outfn_t outfn; + void *data; +} queue_handler[NPROTO]; +static DEFINE_RWLOCK(queue_handler_lock); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *i; + + spin_lock_bh(&nf_hook_lock); + list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } + list_add_rcu(®->list, i->prev); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); + return 0; +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + spin_lock_bh(&nf_hook_lock); + list_del_rcu(®->list); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); +} + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive). */ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct list_head *i; + int ret = 0; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + up(&nf_sockopt_mutex); + return ret; +} + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ + /* No point being interruptible: we're probably in cleanup_module() */ + restart: + down(&nf_sockopt_mutex); + if (reg->use != 0) { + /* To be woken by nf_sockopt call... */ + /* FIXME: Stuart Young's name appears gratuitously. */ + set_current_state(TASK_UNINTERRUPTIBLE); + reg->cleanup_task = current; + up(&nf_sockopt_mutex); + schedule(); + goto restart; + } + list_del(®->list); + up(&nf_sockopt_mutex); +} + +#ifdef CONFIG_NETFILTER_DEBUG +#include <net/ip.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4.h> + +static void debug_print_hooks_ip(unsigned int nf_debug) +{ + if (nf_debug & (1 << NF_IP_PRE_ROUTING)) { + printk("PRE_ROUTING "); + nf_debug ^= (1 << NF_IP_PRE_ROUTING); + } + if (nf_debug & (1 << NF_IP_LOCAL_IN)) { + printk("LOCAL_IN "); + nf_debug ^= (1 << NF_IP_LOCAL_IN); + } + if (nf_debug & (1 << NF_IP_FORWARD)) { + printk("FORWARD "); + nf_debug ^= (1 << NF_IP_FORWARD); + } + if (nf_debug & (1 << NF_IP_LOCAL_OUT)) { + printk("LOCAL_OUT "); + nf_debug ^= (1 << NF_IP_LOCAL_OUT); + } + if (nf_debug & (1 << NF_IP_POST_ROUTING)) { + printk("POST_ROUTING "); + nf_debug ^= (1 << NF_IP_POST_ROUTING); + } + if (nf_debug) + printk("Crap bits: 0x%04X", nf_debug); + printk("\n"); +} + +static void nf_dump_skb(int pf, struct sk_buff *skb) +{ + printk("skb: pf=%i %s dev=%s len=%u\n", + pf, + skb->sk ? "(owned)" : "(unowned)", + skb->dev ? skb->dev->name : "(no dev)", + skb->len); + switch (pf) { + case PF_INET: { + const struct iphdr *ip = skb->nh.iph; + __u32 *opt = (__u32 *) (ip + 1); + int opti; + __u16 src_port = 0, dst_port = 0; + + if (ip->protocol == IPPROTO_TCP + || ip->protocol == IPPROTO_UDP) { + struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); + src_port = ntohs(tcp->source); + dst_port = ntohs(tcp->dest); + } + + printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu" + " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", + ip->protocol, NIPQUAD(ip->saddr), + src_port, NIPQUAD(ip->daddr), + dst_port, + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + ntohs(ip->frag_off), ip->ttl); + + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); + printk("\n"); + } + } +} + +void nf_debug_ip_local_deliver(struct sk_buff *skb) +{ + /* If it's a loopback packet, it must have come through + * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and + * NF_IP_LOCAL_IN. Otherwise, must have gone through + * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */ + if (!skb->dev) { + printk("ip_local_deliver: skb->dev is NULL.\n"); + } + else if (strcmp(skb->dev->name, "lo") == 0) { + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING) + | (1 << NF_IP_PRE_ROUTING) + | (1 << NF_IP_LOCAL_IN))) { + printk("ip_local_deliver: bad loopback skb: "); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } + else { + if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING) + | (1<<NF_IP_LOCAL_IN))) { + printk("ip_local_deliver: bad non-lo skb: "); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } +} + +void nf_debug_ip_loopback_xmit(struct sk_buff *newskb) +{ + if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_dev_loopback_xmit: bad owned skb = %p: ", + newskb); + debug_print_hooks_ip(newskb->nf_debug); + nf_dump_skb(PF_INET, newskb); + } + /* Clear to avoid confusing input check */ + newskb->nf_debug = 0; +} + +void nf_debug_ip_finish_output2(struct sk_buff *skb) +{ + /* If it's owned, it must have gone through the + * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING. + * Otherwise, must have gone through + * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING. + */ + if (skb->sk) { + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_finish_output: bad owned skb = %p: ", skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } else { + if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING) + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_POST_ROUTING))) { + /* Fragments, entunnelled packets, TCP RSTs + generated by ipt_REJECT will have no + owners, but still may be local */ + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))){ + printk("ip_finish_output:" + " bad unowned skb = %p: ",skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } + } +} +#endif /*CONFIG_NETFILTER_DEBUG*/ + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, int pf, int val, + char __user *opt, int *len, int get) +{ + struct list_head *i; + struct nf_sockopt_ops *ops; + int ret; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + ops = (struct nf_sockopt_ops *)i; + if (ops->pf == pf) { + if (get) { + if (val >= ops->get_optmin + && val < ops->get_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->get(sk, val, opt, len); + goto out; + } + } else { + if (val >= ops->set_optmin + && val < ops->set_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->set(sk, val, opt, *len); + goto out; + } + } + } + } + up(&nf_sockopt_mutex); + return -ENOPROTOOPT; + + out: + down(&nf_sockopt_mutex); + ops->use--; + if (ops->cleanup_task) + wake_up_process(ops->cleanup_task); + up(&nf_sockopt_mutex); + return ret; +} + +int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, + int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} + +int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} + +static unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + unsigned int verdict; + + /* + * The caller must not block between calls to this + * function because of risk of continuing from deleted element. + */ + list_for_each_continue_rcu(*i, head) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + + if (hook_thresh > elem->priority) + continue; + + /* Optimization: we don't need to hold module + reference here, since function can't sleep. --RR */ + verdict = elem->hook(hook, skb, indev, outdev, okfn); + if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely(verdict > NF_MAX_VERDICT)) { + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); + continue; + } +#endif + if (verdict != NF_REPEAT) + return verdict; + *i = (*i)->prev; + } + } + return NF_ACCEPT; +} + +int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) +{ + int ret; + + write_lock_bh(&queue_handler_lock); + if (queue_handler[pf].outfn) + ret = -EBUSY; + else { + queue_handler[pf].outfn = outfn; + queue_handler[pf].data = data; + ret = 0; + } + write_unlock_bh(&queue_handler_lock); + + return ret; +} + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(int pf) +{ + write_lock_bh(&queue_handler_lock); + queue_handler[pf].outfn = NULL; + queue_handler[pf].data = NULL; + write_unlock_bh(&queue_handler_lock); + + return 0; +} + +/* + * Any packet that leaves via this function must come back + * through nf_reinject(). + */ +static int nf_queue(struct sk_buff *skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + int status; + struct nf_info *info; +#ifdef CONFIG_BRIDGE_NETFILTER + struct net_device *physindev = NULL; + struct net_device *physoutdev = NULL; +#endif + + /* QUEUE == DROP if noone is waiting, to be safe. */ + read_lock(&queue_handler_lock); + if (!queue_handler[pf].outfn) { + read_unlock(&queue_handler_lock); + kfree_skb(skb); + return 1; + } + + info = kmalloc(sizeof(*info), GFP_ATOMIC); + if (!info) { + if (net_ratelimit()) + printk(KERN_ERR "OOM queueing packet %p\n", + skb); + read_unlock(&queue_handler_lock); + kfree_skb(skb); + return 1; + } + + *info = (struct nf_info) { + (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + + /* If it's going away, ignore hook. */ + if (!try_module_get(info->elem->owner)) { + read_unlock(&queue_handler_lock); + kfree(info); + return 0; + } + + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) dev_hold(indev); + if (outdev) dev_hold(outdev); + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + physindev = skb->nf_bridge->physindev; + if (physindev) dev_hold(physindev); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev) dev_hold(physoutdev); + } +#endif + + status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data); + read_unlock(&queue_handler_lock); + + if (status < 0) { + /* James M doesn't say fuck enough. */ + if (indev) dev_put(indev); + if (outdev) dev_put(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (physindev) dev_put(physindev); + if (physoutdev) dev_put(physoutdev); +#endif + module_put(info->elem->owner); + kfree(info); + kfree_skb(skb); + return 1; + } + return 1; +} + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. */ +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely((*pskb)->nf_debug & (1 << hook))) { + printk("nf_hook: hook %i already set.\n", hook); + nf_dump_skb(pf, *pskb); + } + (*pskb)->nf_debug |= (1 << hook); +#endif + + elem = &nf_hooks[pf][hook]; +next_hook: + verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; + goto unlock; + } else if (verdict == NF_DROP) { + kfree_skb(*pskb); + ret = -EPERM; + } else if (verdict == NF_QUEUE) { + NFDEBUG("nf_hook: Verdict = QUEUE.\n"); + if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn)) + goto next_hook; + } +unlock: + rcu_read_unlock(); + return ret; +} + +void nf_reinject(struct sk_buff *skb, struct nf_info *info, + unsigned int verdict) +{ + struct list_head *elem = &info->elem->list; + struct list_head *i; + + rcu_read_lock(); + + /* Release those devices we held, or Alexey will kill me. */ + if (info->indev) dev_put(info->indev); + if (info->outdev) dev_put(info->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + if (skb->nf_bridge->physindev) + dev_put(skb->nf_bridge->physindev); + if (skb->nf_bridge->physoutdev) + dev_put(skb->nf_bridge->physoutdev); + } +#endif + + /* Drop reference to owner of hook which queued us. */ + module_put(info->elem->owner); + + list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { + if (i == elem) + break; + } + + if (elem == &nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. */ + NFDEBUG("%s: module disappeared, dropping packet.\n", + __FUNCTION__); + verdict = NF_DROP; + } + + /* Continue traversal iff userspace said ok... */ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + + if (verdict == NF_ACCEPT) { + next_hook: + verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem, + info->okfn, INT_MIN); + } + + switch (verdict) { + case NF_ACCEPT: + info->okfn(skb); + break; + + case NF_QUEUE: + if (!nf_queue(skb, elem, info->pf, info->hook, + info->indev, info->outdev, info->okfn)) + goto next_hook; + break; + } + rcu_read_unlock(); + + if (verdict == NF_DROP) + kfree_skb(skb); + + kfree(info); + return; +} + +#ifdef CONFIG_INET +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct rtable *rt; + struct flowi fl = {}; + struct dst_entry *odst; + unsigned int hh_len; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + */ + if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + /* Drop old route. */ + dst_release((*pskb)->dst); + (*pskb)->dst = &rt->u.dst; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + odst = (*pskb)->dst; + if (ip_route_input(*pskb, iph->daddr, iph->saddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return -1; + } + dst_release(&rt->u.dst); + dst_release(odst); + } + + if ((*pskb)->dst->error) + return -1; + + /* Change in oif may mean change in hh_len. */ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); + +int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) +{ + struct sk_buff *nskb; + + if (writable_len > (*pskb)->len) + return 0; + + /* Not exclusive use of packet? Must copy. */ + if (skb_shared(*pskb) || skb_cloned(*pskb)) + goto copy_skb; + + return pskb_may_pull(*pskb, writable_len); + +copy_skb: + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return 0; + BUG_ON(skb_is_nonlinear(nskb)); + + /* Rest of kernel will get very unhappy if we pass it a + suddenly-orphaned skbuff */ + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + return 1; +} +EXPORT_SYMBOL(skb_ip_make_writable); +#endif /*CONFIG_INET*/ + +/* Internal logging interface, which relies on the real + LOG target modules */ + +#define NF_LOG_PREFIXLEN 128 + +static nf_logfn *nf_logging[NPROTO]; /* = NULL */ +static int reported = 0; +static DEFINE_SPINLOCK(nf_log_lock); + +int nf_log_register(int pf, nf_logfn *logfn) +{ + int ret = -EBUSY; + + /* Any setup of logging members must be done before + * substituting pointer. */ + spin_lock(&nf_log_lock); + if (!nf_logging[pf]) { + rcu_assign_pointer(nf_logging[pf], logfn); + ret = 0; + } + spin_unlock(&nf_log_lock); + return ret; +} + +void nf_log_unregister(int pf, nf_logfn *logfn) +{ + spin_lock(&nf_log_lock); + if (nf_logging[pf] == logfn) + nf_logging[pf] = NULL; + spin_unlock(&nf_log_lock); + + /* Give time to concurrent readers. */ + synchronize_net(); +} + +void nf_log_packet(int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + nf_logfn *logfn; + + rcu_read_lock(); + logfn = rcu_dereference(nf_logging[pf]); + if (logfn) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + /* We must read logging before nf_logfn[pf] */ + logfn(hooknum, skb, in, out, prefix); + } else if (!reported) { + printk(KERN_WARNING "nf_log_packet: can\'t log yet, " + "no backend logging module loaded in!\n"); + reported++; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_register); +EXPORT_SYMBOL(nf_log_unregister); +EXPORT_SYMBOL(nf_log_packet); + +/* This does not belong here, but locally generated errors need it if connection + tracking in use: without this, connection may not be in hash table, and hence + manufactured ICMP or RST packets will not be associated with it. */ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ + void (*attach)(struct sk_buff *, struct sk_buff *); + + if (skb->nfct && (attach = ip_ct_attach) != NULL) { + mb(); /* Just to be sure: must be read before executing this */ + attach(new, skb); + } +} + +void __init netfilter_init(void) +{ + int i, h; + + for (i = 0; i < NPROTO; i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); + } +} + +EXPORT_SYMBOL(ip_ct_attach); +EXPORT_SYMBOL(nf_ct_attach); +EXPORT_SYMBOL(nf_getsockopt); +EXPORT_SYMBOL(nf_hook_slow); +EXPORT_SYMBOL(nf_hooks); +EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_queue_handler); +EXPORT_SYMBOL(nf_register_sockopt); +EXPORT_SYMBOL(nf_reinject); +EXPORT_SYMBOL(nf_setsockopt); +EXPORT_SYMBOL(nf_unregister_hook); +EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/core/netpoll.c b/net/core/netpoll.c new file mode 100644 index 000000000000..a119696d5521 --- /dev/null +++ b/net/core/netpoll.c @@ -0,0 +1,735 @@ +/* + * Common framework for low-level network console, dump, and debugger code + * + * Sep 8 2003 Matt Mackall <mpm@selenic.com> + * + * based on the netconsole code from: + * + * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Red Hat, Inc. + */ + +#include <linux/smp_lock.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/unaligned.h> + +/* + * We maintain a small pool of fully-sized skbs, to make sure the + * message gets out even in extreme OOM situations. + */ + +#define MAX_UDP_CHUNK 1460 +#define MAX_SKBS 32 +#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) + +static DEFINE_SPINLOCK(skb_list_lock); +static int nr_skbs; +static struct sk_buff *skbs; + +static DEFINE_SPINLOCK(queue_lock); +static int queue_depth; +static struct sk_buff *queue_head, *queue_tail; + +static atomic_t trapped; + +#define NETPOLL_RX_ENABLED 1 +#define NETPOLL_RX_DROP 2 + +#define MAX_SKB_SIZE \ + (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ + sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void zap_completion_queue(void); + +static void queue_process(void *p) +{ + unsigned long flags; + struct sk_buff *skb; + + while (queue_head) { + spin_lock_irqsave(&queue_lock, flags); + + skb = queue_head; + queue_head = skb->next; + if (skb == queue_tail) + queue_head = NULL; + + queue_depth--; + + spin_unlock_irqrestore(&queue_lock, flags); + + dev_queue_xmit(skb); + } +} + +static DECLARE_WORK(send_queue, queue_process, NULL); + +void netpoll_queue(struct sk_buff *skb) +{ + unsigned long flags; + + if (queue_depth == MAX_QUEUE_DEPTH) { + __kfree_skb(skb); + return; + } + + spin_lock_irqsave(&queue_lock, flags); + if (!queue_head) + queue_head = skb; + else + queue_tail->next = skb; + queue_tail = skb; + queue_depth++; + spin_unlock_irqrestore(&queue_lock, flags); + + schedule_work(&send_queue); +} + +static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) +{ + if (uh->check == 0) + return 0; + + if (skb->ip_summed == CHECKSUM_HW) + return csum_tcpudp_magic( + saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); +} + +/* + * Check whether delayed processing was scheduled for our NIC. If so, + * we attempt to grab the poll lock and use ->poll() to pump the card. + * If this fails, either we've recursed in ->poll() or it's already + * running on another CPU. + * + * Note: we don't mask interrupts with this lock because we're using + * trylock here and interrupts are already disabled in the softirq + * case. Further, we test the poll_owner to avoid recursion on UP + * systems where the lock doesn't exist. + * + * In cases where there is bi-directional communications, reading only + * one message at a time can lead to packets being dropped by the + * network adapter, forcing superfluous retries and possibly timeouts. + * Thus, we set our budget to greater than 1. + */ +static void poll_napi(struct netpoll *np) +{ + int budget = 16; + + if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && + np->poll_owner != smp_processor_id() && + spin_trylock(&np->poll_lock)) { + np->rx_flags |= NETPOLL_RX_DROP; + atomic_inc(&trapped); + + np->dev->poll(np->dev, &budget); + + atomic_dec(&trapped); + np->rx_flags &= ~NETPOLL_RX_DROP; + spin_unlock(&np->poll_lock); + } +} + +void netpoll_poll(struct netpoll *np) +{ + if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller) + return; + + /* Process pending work on NIC */ + np->dev->poll_controller(np->dev); + if (np->dev->poll) + poll_napi(np); + + zap_completion_queue(); +} + +static void refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&skb_list_lock, flags); + while (nr_skbs < MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + + skb->next = skbs; + skbs = skb; + nr_skbs++; + } + spin_unlock_irqrestore(&skb_list_lock, flags); +} + +static void zap_completion_queue(void) +{ + unsigned long flags; + struct softnet_data *sd = &get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if(skb->destructor) + dev_kfree_skb_any(skb); /* put this one back */ + else + __kfree_skb(skb); + } + } + + put_cpu_var(softnet_data); +} + +static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) +{ + int once = 1, count = 0; + unsigned long flags; + struct sk_buff *skb = NULL; + + zap_completion_queue(); +repeat: + if (nr_skbs < MAX_SKBS) + refill_skbs(); + + skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) { + spin_lock_irqsave(&skb_list_lock, flags); + skb = skbs; + if (skb) { + skbs = skb->next; + skb->next = NULL; + nr_skbs--; + } + spin_unlock_irqrestore(&skb_list_lock, flags); + } + + if(!skb) { + count++; + if (once && (count == 1000000)) { + printk("out of netpoll skbs!\n"); + once = 0; + } + netpoll_poll(np); + goto repeat; + } + + atomic_set(&skb->users, 1); + skb_reserve(skb, reserve); + return skb; +} + +static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + int status; + +repeat: + if(!np || !np->dev || !netif_running(np->dev)) { + __kfree_skb(skb); + return; + } + + /* avoid recursion */ + if(np->poll_owner == smp_processor_id() || + np->dev->xmit_lock_owner == smp_processor_id()) { + if (np->drop) + np->drop(skb); + else + __kfree_skb(skb); + return; + } + + spin_lock(&np->dev->xmit_lock); + np->dev->xmit_lock_owner = smp_processor_id(); + + /* + * network drivers do not expect to be called if the queue is + * stopped. + */ + if (netif_queue_stopped(np->dev)) { + np->dev->xmit_lock_owner = -1; + spin_unlock(&np->dev->xmit_lock); + + netpoll_poll(np); + goto repeat; + } + + status = np->dev->hard_start_xmit(skb, np->dev); + np->dev->xmit_lock_owner = -1; + spin_unlock(&np->dev->xmit_lock); + + /* transmit busy */ + if(status) { + netpoll_poll(np); + goto repeat; + } +} + +void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ + int total_len, eth_len, ip_len, udp_len; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = len + sizeof(*udph); + ip_len = eth_len = udp_len + sizeof(*iph); + total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; + + skb = find_skb(np, total_len, total_len - len); + if (!skb) + return; + + memcpy(skb->data, msg, len); + skb->len += len; + + udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + udph->check = 0; + + iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); + + /* iph->version = 4; iph->ihl = 5; */ + put_unaligned(0x45, (unsigned char *)iph); + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(htonl(np->local_ip), &(iph->saddr)); + put_unaligned(htonl(np->remote_ip), &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + + eth->h_proto = htons(ETH_P_IP); + memcpy(eth->h_source, np->local_mac, 6); + memcpy(eth->h_dest, np->remote_mac, 6); + + skb->dev = np->dev; + + netpoll_send_skb(np, skb); +} + +static void arp_reply(struct sk_buff *skb) +{ + struct arphdr *arp; + unsigned char *arp_ptr; + int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; + u32 sip, tip; + struct sk_buff *send_skb; + struct netpoll *np = skb->dev->np; + + if (!np) return; + + /* No arp on this interface */ + if (skb->dev->flags & IFF_NOARP) + return; + + if (!pskb_may_pull(skb, (sizeof(struct arphdr) + + (2 * skb->dev->addr_len) + + (2 * sizeof(u32))))) + return; + + skb->h.raw = skb->nh.raw = skb->data; + arp = skb->nh.arph; + + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_op != htons(ARPOP_REQUEST)) + return; + + arp_ptr = (unsigned char *)(arp+1) + skb->dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4 + skb->dev->addr_len; + memcpy(&tip, arp_ptr, 4); + + /* Should we ignore arp? */ + if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip)) + return; + + size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4); + send_skb = find_skb(np, size + LL_RESERVED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + + if (!send_skb) + return; + + send_skb->nh.raw = send_skb->data; + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); + + /* Fill the device header for the ARP frame */ + + if (np->dev->hard_header && + np->dev->hard_header(send_skb, skb->dev, ptype, + np->remote_mac, np->local_mac, + send_skb->len) < 0) { + kfree_skb(send_skb); + return; + } + + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should always equal 1 (Ethernet). + */ + + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr=(unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, np->remote_mac, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); +} + +int __netpoll_rx(struct sk_buff *skb) +{ + int proto, len, ulen; + struct iphdr *iph; + struct udphdr *uh; + struct netpoll *np = skb->dev->np; + + if (!np->rx_hook) + goto out; + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + /* check if netpoll clients need ARP */ + if (skb->protocol == __constant_htons(ETH_P_ARP) && + atomic_read(&trapped)) { + arp_reply(skb); + return 1; + } + + proto = ntohs(eth_hdr(skb)->h_proto); + if (proto != ETH_P_IP) + goto out; + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + if (skb_shared(skb)) + goto out; + + iph = (struct iphdr *)skb->data; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + if (iph->ihl < 5 || iph->version != 4) + goto out; + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + if (iph->protocol != IPPROTO_UDP) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len) + goto out; + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) + goto out; + if (np->local_ip && np->local_ip != ntohl(iph->daddr)) + goto out; + if (np->remote_ip && np->remote_ip != ntohl(iph->saddr)) + goto out; + if (np->local_port && np->local_port != ntohs(uh->dest)) + goto out; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + + kfree_skb(skb); + return 1; + +out: + if (atomic_read(&trapped)) { + kfree_skb(skb); + return 1; + } + + return 0; +} + +int netpoll_parse_options(struct netpoll *np, char *opt) +{ + char *cur=opt, *delim; + + if(*cur != '@') { + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim=0; + np->local_port=simple_strtol(cur, NULL, 10); + cur=delim; + } + cur++; + printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port); + + if(*cur != '/') { + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim=0; + np->local_ip=ntohl(in_aton(cur)); + cur=delim; + + printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->local_ip)); + } + cur++; + + if ( *cur != ',') { + /* parse out dev name */ + if ((delim = strchr(cur, ',')) == NULL) + goto parse_failed; + *delim=0; + strlcpy(np->dev_name, cur, sizeof(np->dev_name)); + cur=delim; + } + cur++; + + printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name); + + if ( *cur != '@' ) { + /* dst port */ + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim=0; + np->remote_port=simple_strtol(cur, NULL, 10); + cur=delim; + } + cur++; + printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); + + /* dst ip */ + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim=0; + np->remote_ip=ntohl(in_aton(cur)); + cur=delim+1; + + printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->remote_ip)); + + if( *cur != 0 ) + { + /* MAC address */ + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[0]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[1]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[2]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[3]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[4]=simple_strtol(cur, NULL, 16); + cur=delim+1; + np->remote_mac[5]=simple_strtol(cur, NULL, 16); + } + + printk(KERN_INFO "%s: remote ethernet address " + "%02x:%02x:%02x:%02x:%02x:%02x\n", + np->name, + np->remote_mac[0], + np->remote_mac[1], + np->remote_mac[2], + np->remote_mac[3], + np->remote_mac[4], + np->remote_mac[5]); + + return 0; + + parse_failed: + printk(KERN_INFO "%s: couldn't parse config at %s!\n", + np->name, cur); + return -1; +} + +int netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = NULL; + struct in_device *in_dev; + + np->poll_lock = SPIN_LOCK_UNLOCKED; + np->poll_owner = -1; + + if (np->dev_name) + ndev = dev_get_by_name(np->dev_name); + if (!ndev) { + printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", + np->name, np->dev_name); + return -1; + } + + np->dev = ndev; + ndev->np = np; + + if (!ndev->poll_controller) { + printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", + np->name, np->dev_name); + goto release; + } + + if (!netif_running(ndev)) { + unsigned long atmost, atleast; + + printk(KERN_INFO "%s: device %s not up yet, forcing it\n", + np->name, np->dev_name); + + rtnl_shlock(); + if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) { + printk(KERN_ERR "%s: failed to open %s\n", + np->name, np->dev_name); + rtnl_shunlock(); + goto release; + } + rtnl_shunlock(); + + atleast = jiffies + HZ/10; + atmost = jiffies + 4*HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + printk(KERN_NOTICE + "%s: timeout waiting for carrier\n", + np->name); + break; + } + cond_resched(); + } + + /* If carrier appears to come up instantly, we don't + * trust it and pause so that we don't pump all our + * queued console messages into the bitbucket. + */ + + if (time_before(jiffies, atleast)) { + printk(KERN_NOTICE "%s: carrier detect appears" + " untrustworthy, waiting 4 seconds\n", + np->name); + msleep(4000); + } + } + + if (!memcmp(np->local_mac, "\0\0\0\0\0\0", 6) && ndev->dev_addr) + memcpy(np->local_mac, ndev->dev_addr, 6); + + if (!np->local_ip) { + rcu_read_lock(); + in_dev = __in_dev_get(ndev); + + if (!in_dev || !in_dev->ifa_list) { + rcu_read_unlock(); + printk(KERN_ERR "%s: no IP address for %s, aborting\n", + np->name, np->dev_name); + goto release; + } + + np->local_ip = ntohl(in_dev->ifa_list->ifa_local); + rcu_read_unlock(); + printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->local_ip)); + } + + if(np->rx_hook) + np->rx_flags = NETPOLL_RX_ENABLED; + + return 0; + + release: + ndev->np = NULL; + np->dev = NULL; + dev_put(ndev); + return -1; +} + +void netpoll_cleanup(struct netpoll *np) +{ + if (np->dev) + np->dev->np = NULL; + dev_put(np->dev); + np->dev = NULL; +} + +int netpoll_trap(void) +{ + return atomic_read(&trapped); +} + +void netpoll_set_trap(int trap) +{ + if (trap) + atomic_inc(&trapped); + else + atomic_dec(&trapped); +} + +EXPORT_SYMBOL(netpoll_set_trap); +EXPORT_SYMBOL(netpoll_trap); +EXPORT_SYMBOL(netpoll_parse_options); +EXPORT_SYMBOL(netpoll_setup); +EXPORT_SYMBOL(netpoll_cleanup); +EXPORT_SYMBOL(netpoll_send_udp); +EXPORT_SYMBOL(netpoll_poll); +EXPORT_SYMBOL(netpoll_queue); diff --git a/net/core/pktgen.c b/net/core/pktgen.c new file mode 100644 index 000000000000..c57b06bc79f3 --- /dev/null +++ b/net/core/pktgen.c @@ -0,0 +1,3132 @@ +/* + * Authors: + * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se> + * Uppsala University and + * Swedish University of Agricultural Sciences + * + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Ben Greear <greearb@candelatech.com> + * Jens Låås <jens.laas@data.slu.se> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * A tool for loading the network with preconfigurated packets. + * The tool is implemented as a linux module. Parameters are output + * device, delay (to hard_xmit), number of packets, and whether + * to use multiple SKBs or just the same one. + * pktgen uses the installed interface's output routine. + * + * Additional hacking by: + * + * Jens.Laas@data.slu.se + * Improved by ANK. 010120. + * Improved by ANK even more. 010212. + * MAC address typo fixed. 010417 --ro + * Integrated. 020301 --DaveM + * Added multiskb option 020301 --DaveM + * Scaling of results. 020417--sigurdur@linpro.no + * Significant re-work of the module: + * * Convert to threaded model to more efficiently be able to transmit + * and receive on multiple interfaces at once. + * * Converted many counters to __u64 to allow longer runs. + * * Allow configuration of ranges, like min/max IP address, MACs, + * and UDP-ports, for both source and destination, and can + * set to use a random distribution or sequentially walk the range. + * * Can now change most values after starting. + * * Place 12-byte packet in UDP payload with magic number, + * sequence number, and timestamp. + * * Add receiver code that detects dropped pkts, re-ordered pkts, and + * latencies (with micro-second) precision. + * * Add IOCTL interface to easily get counters & configuration. + * --Ben Greear <greearb@candelatech.com> + * + * Renamed multiskb to clone_skb and cleaned up sending core for two distinct + * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0 + * as a "fastpath" with a configurable number of clones after alloc's. + * clone_skb=0 means all packets are allocated this also means ranges time + * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100 + * clones. + * + * Also moved to /proc/net/pktgen/ + * --ro + * + * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever + * mistakes. Also merged in DaveM's patch in the -pre6 patch. + * --Ben Greear <greearb@candelatech.com> + * + * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) + * + * + * 021124 Finished major redesign and rewrite for new functionality. + * See Documentation/networking/pktgen.txt for how to use this. + * + * The new operation: + * For each CPU one thread/process is created at start. This process checks + * for running devices in the if_list and sends packets until count is 0 it + * also the thread checks the thread->control which is used for inter-process + * communication. controlling process "posts" operations to the threads this + * way. The if_lock should be possible to remove when add/rem_device is merged + * into this too. + * + * By design there should only be *one* "controlling" process. In practice + * multiple write accesses gives unpredictable result. Understood by "write" + * to /proc gives result code thats should be read be the "writer". + * For pratical use this should be no problem. + * + * Note when adding devices to a specific CPU there good idea to also assign + * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU. + * --ro + * + * Fix refcount off by one if first packet fails, potential null deref, + * memleak 030710- KJP + * + * First "ranges" functionality for ipv6 030726 --ro + * + * Included flow support. 030802 ANK. + * + * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org> + * + * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419 + * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604 + * + * New xmit() return, do_div and misc clean up by Stephen Hemminger + * <shemminger@osdl.org> 040923 + * + * Rany Dunlap fixed u64 printk compiler waring + * + * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org> + * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213 + * + * Corrections from Nikolai Malykh (nmalykh@bilim.com) + * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230 + * + * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com> + * 050103 + */ +#include <linux/sys.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/smp_lock.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/sched.h> +#include <linux/unistd.h> +#include <linux/string.h> +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/inet.h> +#include <linux/inetdevice.h> +#include <linux/rtnetlink.h> +#include <linux/if_arp.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include <linux/proc_fs.h> +#include <linux/wait.h> +#include <net/checksum.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <asm/byteorder.h> +#include <linux/rcupdate.h> +#include <asm/bitops.h> +#include <asm/io.h> +#include <asm/dma.h> +#include <asm/uaccess.h> +#include <asm/div64.h> /* do_div */ +#include <asm/timex.h> + + +#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" + +/* #define PG_DEBUG(a) a */ +#define PG_DEBUG(a) + +/* The buckets are exponential in 'width' */ +#define LAT_BUCKETS_MAX 32 +#define IP_NAME_SZ 32 + +/* Device flag bits */ +#define F_IPSRC_RND (1<<0) /* IP-Src Random */ +#define F_IPDST_RND (1<<1) /* IP-Dst Random */ +#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ +#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ +#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ +#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ +#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ +#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ + +/* Thread control flag bits */ +#define T_TERMINATE (1<<0) +#define T_STOP (1<<1) /* Stop run */ +#define T_RUN (1<<2) /* Start run */ +#define T_REMDEV (1<<3) /* Remove all devs */ + +/* Locks */ +#define thread_lock() spin_lock(&_thread_lock) +#define thread_unlock() spin_unlock(&_thread_lock) + +/* If lock -- can be removed after some work */ +#define if_lock(t) spin_lock(&(t->if_lock)); +#define if_unlock(t) spin_unlock(&(t->if_lock)); + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" + +#define MAX_CFLOWS 65536 + +struct flow_state +{ + __u32 cur_daddr; + int count; +}; + +struct pktgen_dev { + + /* + * Try to keep frequent/infrequent used vars. separated. + */ + + char ifname[32]; + struct proc_dir_entry *proc_ent; + char result[512]; + /* proc file names */ + char fname[80]; + + struct pktgen_thread* pg_thread; /* the owner */ + struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */ + + int running; /* if this changes to false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. + */ + __u32 flags; + + int min_pkt_size; /* = ETH_ZLEN; */ + int max_pkt_size; /* = ETH_ZLEN; */ + int nfrags; + __u32 delay_us; /* Default delay */ + __u32 delay_ns; + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ + + /* runtime counters relating to clone_skb */ + __u64 next_tx_us; /* timestamp of when to tx next */ + __u32 next_tx_ns; + + __u64 allocated_skbs; + __u32 clone_count; + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? This will keep + * sequence numbers in order, for example. + */ + __u64 started_at; /* micro-seconds */ + __u64 stopped_at; /* micro-seconds */ + __u64 idle_acc; /* micro-seconds */ + __u32 seq_num; + + int clone_skb; /* Use multiple SKBs during packet gen. If this number + * is greater than 1, then that many coppies of the same + * packet will be sent before a new packet is allocated. + * For instance, if you want to send 1024 identical packets + * before creating a new packet, set clone_skb to 1024. + */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + __u32 saddr_min; /* inclusive, source IP address */ + __u32 saddr_max; /* exclusive, source IP address */ + __u32 daddr_min; /* inclusive, dest IP address */ + __u32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + unsigned char dst_mac[6]; + unsigned char src_mac[6]; + + __u32 cur_dst_mac_offset; + __u32 cur_src_mac_offset; + __u32 cur_saddr; + __u32 cur_daddr; + __u16 cur_udp_dst; + __u16 cur_udp_src; + __u32 cur_pkt_size; + + __u8 hh[14]; + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + + We fill in SRC address later + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00 + }; + */ + __u16 pad; /* pad out the hh struct to an even 16 bytes */ + + struct sk_buff* skb; /* skb we are to transmit next, mainly used for when we + * are transmitting the same one multiple times + */ + struct net_device* odev; /* The out-going device. Note that the device should + * have it's pg_info pointer pointing back to this + * device. This will be set when the user specifies + * the out-going device name (not when the inject is + * started as it used to do.) + */ + struct flow_state *flows; + unsigned cflows; /* Concurrent flows (config) */ + unsigned lflow; /* Flow length (config) */ + unsigned nflows; /* accumulated flows (stats) */ +}; + +struct pktgen_hdr { + __u32 pgh_magic; + __u32 seq_num; + __u32 tv_sec; + __u32 tv_usec; +}; + +struct pktgen_thread { + spinlock_t if_lock; + struct pktgen_dev *if_list; /* All device here */ + struct pktgen_thread* next; + char name[32]; + char fname[128]; /* name of proc file */ + struct proc_dir_entry *proc_ent; + char result[512]; + u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ + + /* Field for thread to receive "posted" events terminate, stop ifs etc.*/ + + u32 control; + int pid; + int cpu; + + wait_queue_head_t queue; +}; + +#define REMOVE 1 +#define FIND 0 + +/* This code works around the fact that do_div cannot handle two 64-bit + numbers, and regular 64-bit division doesn't work on x86 kernels. + --Ben +*/ + +#define PG_DIV 0 + +/* This was emailed to LMKL by: Chris Caputo <ccaputo@alt.net> + * Function copied/adapted/optimized from: + * + * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html + * + * Copyright 1994, University of Cambridge Computer Laboratory + * All Rights Reserved. + * + */ +inline static s64 divremdi3(s64 x, s64 y, int type) +{ + u64 a = (x < 0) ? -x : x; + u64 b = (y < 0) ? -y : y; + u64 res = 0, d = 1; + + if (b > 0) { + while (b < a) { + b <<= 1; + d <<= 1; + } + } + + do { + if ( a >= b ) { + a -= b; + res += d; + } + b >>= 1; + d >>= 1; + } + while (d); + + if (PG_DIV == type) { + return (((x ^ y) & (1ll<<63)) == 0) ? res : -(s64)res; + } + else { + return ((x & (1ll<<63)) == 0) ? a : -(s64)a; + } +} + +/* End of hacks to deal with 64-bit math on x86 */ + +/** Convert to miliseconds */ +static inline __u64 tv_to_ms(const struct timeval* tv) +{ + __u64 ms = tv->tv_usec / 1000; + ms += (__u64)tv->tv_sec * (__u64)1000; + return ms; +} + + +/** Convert to micro-seconds */ +static inline __u64 tv_to_us(const struct timeval* tv) +{ + __u64 us = tv->tv_usec; + us += (__u64)tv->tv_sec * (__u64)1000000; + return us; +} + +static inline __u64 pg_div(__u64 n, __u32 base) { + __u64 tmp = n; + do_div(tmp, base); + /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n", + n, base, tmp); */ + return tmp; +} + +static inline __u64 pg_div64(__u64 n, __u64 base) +{ + __u64 tmp = n; +/* + * How do we know if the architectrure we are running on + * supports division with 64 bit base? + * + */ +#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__) + + do_div(tmp, base); +#else + tmp = divremdi3(n, base, PG_DIV); +#endif + return tmp; +} + +static inline u32 pktgen_random(void) +{ +#if 0 + __u32 n; + get_random_bytes(&n, 4); + return n; +#else + return net_random(); +#endif +} + +static inline __u64 getCurMs(void) +{ + struct timeval tv; + do_gettimeofday(&tv); + return tv_to_ms(&tv); +} + +static inline __u64 getCurUs(void) +{ + struct timeval tv; + do_gettimeofday(&tv); + return tv_to_us(&tv); +} + +static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b) +{ + return tv_to_us(a) - tv_to_us(b); +} + + +/* old include end */ + +static char version[] __initdata = VERSION; + +static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, size_t count, loff_t *ppos); +static ssize_t proc_pgctrl_write(struct file* file, const char __user * buf, size_t count, loff_t *ppos); +static int proc_if_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); + +static int proc_thread_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); +static int proc_if_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); +static int proc_thread_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); +static int create_proc_dir(void); +static int remove_proc_dir(void); + +static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread* t, const char* ifname); +static struct pktgen_thread* pktgen_find_thread(const char* name); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread* t, const char* ifname); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(void); +static void pktgen_stop_all_threads_ifs(void); +static int pktgen_stop_device(struct pktgen_dev *pkt_dev); +static void pktgen_stop(struct pktgen_thread* t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static struct pktgen_dev *pktgen_NN_threads(const char* dev_name, int remove); +static unsigned int scan_ip6(const char *s,char ip[16]); +static unsigned int fmt_ip6(char *s,const char ip[16]); + +/* Module parameters, defaults. */ +static int pg_count_d = 1000; /* 1000 pkts by default */ +static int pg_delay_d = 0; +static int pg_clone_skb_d = 0; +static int debug = 0; + +static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED; +static struct pktgen_thread *pktgen_threads = NULL; + +static char module_fname[128]; +static struct proc_dir_entry *module_proc_ent = NULL; + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, +}; + +static struct file_operations pktgen_fops = { + .read = proc_pgctrl_read, + .write = proc_pgctrl_write, + /* .ioctl = pktgen_ioctl, later maybe */ +}; + +/* + * /proc handling functions + * + */ + +static struct proc_dir_entry *pg_proc_dir = NULL; +static int proc_pgctrl_read_eof=0; + +static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, + size_t count, loff_t *ppos) +{ + char data[200]; + int len = 0; + + if(proc_pgctrl_read_eof) { + proc_pgctrl_read_eof=0; + len = 0; + goto out; + } + + sprintf(data, "%s", VERSION); + + len = strlen(data); + + if(len > count) { + len =-EFAULT; + goto out; + } + + if (copy_to_user(buf, data, len)) { + len =-EFAULT; + goto out; + } + + *ppos += len; + proc_pgctrl_read_eof=1; /* EOF next call */ + + out: + return len; +} + +static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf, + size_t count, loff_t *ppos) +{ + char *data = NULL; + int err = 0; + + if (!capable(CAP_NET_ADMIN)){ + err = -EPERM; + goto out; + } + + data = (void*)vmalloc ((unsigned int)count); + + if(!data) { + err = -ENOMEM; + goto out; + } + if (copy_from_user(data, buf, count)) { + err =-EFAULT; + goto out_free; + } + data[count-1] = 0; /* Make string */ + + if (!strcmp(data, "stop")) + pktgen_stop_all_threads_ifs(); + + else if (!strcmp(data, "start")) + pktgen_run_all_threads(); + + else + printk("pktgen: Unknown command: %s\n", data); + + err = count; + + out_free: + vfree (data); + out: + return err; +} + +static int proc_if_read(char *buf , char **start, off_t offset, + int len, int *eof, void *data) +{ + char *p; + int i; + struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); + __u64 sa; + __u64 stopped; + __u64 now = getCurUs(); + + p = buf; + p += sprintf(p, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", + (unsigned long long) pkt_dev->count, + pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + + p += sprintf(p, " frags: %d delay: %u clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname); + + p += sprintf(p, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); + + + if(pkt_dev->flags & F_IPV6) { + char b1[128], b2[128], b3[128]; + fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); + fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr); + fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr); + p += sprintf(p, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3); + + fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr); + fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr); + fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr); + p += sprintf(p, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3); + + } + else + p += sprintf(p, " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max); + + p += sprintf(p, " src_mac: "); + + if ((pkt_dev->src_mac[0] == 0) && + (pkt_dev->src_mac[1] == 0) && + (pkt_dev->src_mac[2] == 0) && + (pkt_dev->src_mac[3] == 0) && + (pkt_dev->src_mac[4] == 0) && + (pkt_dev->src_mac[5] == 0)) + + for (i = 0; i < 6; i++) + p += sprintf(p, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":"); + + else + for (i = 0; i < 6; i++) + p += sprintf(p, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":"); + + p += sprintf(p, "dst_mac: "); + for (i = 0; i < 6; i++) + p += sprintf(p, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":"); + + p += sprintf(p, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", + pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, + pkt_dev->udp_dst_max); + + p += sprintf(p, " src_mac_count: %d dst_mac_count: %d \n Flags: ", + pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + + + if (pkt_dev->flags & F_IPV6) + p += sprintf(p, "IPV6 "); + + if (pkt_dev->flags & F_IPSRC_RND) + p += sprintf(p, "IPSRC_RND "); + + if (pkt_dev->flags & F_IPDST_RND) + p += sprintf(p, "IPDST_RND "); + + if (pkt_dev->flags & F_TXSIZE_RND) + p += sprintf(p, "TXSIZE_RND "); + + if (pkt_dev->flags & F_UDPSRC_RND) + p += sprintf(p, "UDPSRC_RND "); + + if (pkt_dev->flags & F_UDPDST_RND) + p += sprintf(p, "UDPDST_RND "); + + if (pkt_dev->flags & F_MACSRC_RND) + p += sprintf(p, "MACSRC_RND "); + + if (pkt_dev->flags & F_MACDST_RND) + p += sprintf(p, "MACDST_RND "); + + + p += sprintf(p, "\n"); + + sa = pkt_dev->started_at; + stopped = pkt_dev->stopped_at; + if (pkt_dev->running) + stopped = now; /* not really stopped, more like last-running-at */ + + p += sprintf(p, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) pkt_dev->sofar, + (unsigned long long) pkt_dev->errors, + (unsigned long long) sa, + (unsigned long long) stopped, + (unsigned long long) pkt_dev->idle_acc); + + p += sprintf(p, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", + pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, pkt_dev->cur_src_mac_offset); + + if(pkt_dev->flags & F_IPV6) { + char b1[128], b2[128]; + fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr); + fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr); + p += sprintf(p, " cur_saddr: %s cur_daddr: %s\n", b2, b1); + } + else + p += sprintf(p, " cur_saddr: 0x%x cur_daddr: 0x%x\n", + pkt_dev->cur_saddr, pkt_dev->cur_daddr); + + + p += sprintf(p, " cur_udp_dst: %d cur_udp_src: %d\n", + pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + + p += sprintf(p, " flows: %u\n", pkt_dev->nflows); + + if (pkt_dev->result[0]) + p += sprintf(p, "Result: %s\n", pkt_dev->result); + else + p += sprintf(p, "Result: Idle\n"); + *eof = 1; + + return p - buf; +} + + +static int count_trail_chars(const char __user *user_buffer, unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + case '=': + break; + default: + goto done; + }; + } +done: + return i; +} + +static unsigned long num_arg(const char __user *user_buffer, unsigned long maxlen, + unsigned long *num) +{ + int i = 0; + *num = 0; + + for(; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + if ((c >= '0') && (c <= '9')) { + *num *= 10; + *num += c -'0'; + } else + break; + } + return i; +} + +static int strn_len(const char __user *user_buffer, unsigned int maxlen) +{ + int i = 0; + + for(; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + goto done_str; + break; + default: + break; + }; + } +done_str: + + return i; +} + +static int proc_if_write(struct file *file, const char __user *user_buffer, + unsigned long count, void *data) +{ + int i = 0, max, len; + char name[16], valstr[32]; + unsigned long value = 0; + struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); + char* pg_result = NULL; + int tmp = 0; + char buf[128]; + + pg_result = &(pkt_dev->result[0]); + + if (count < 1) { + printk("pktgen: wrong command format\n"); + return -EINVAL; + } + + max = count - i; + tmp = count_trail_chars(&user_buffer[i], max); + if (tmp < 0) { + printk("pktgen: illegal format\n"); + return tmp; + } + i += tmp; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) { return len; } + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len) ) + return -EFAULT; + i += len; + + max = count -i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) { + char tb[count + 1]; + if (copy_from_user(tb, user_buffer, count)) + return -EFAULT; + tb[count] = 0; + printk("pktgen: %s,%lu buffer -:%s:-\n", name, count, tb); + } + + if (!strcmp(name, "min_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value < 14+20+8) + value = 14+20+8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: min_pkt_size=%u", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "max_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value < 14+20+8) + value = 14+20+8; + if (value != pkt_dev->max_pkt_size) { + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: max_pkt_size=%u", pkt_dev->max_pkt_size); + return count; + } + + /* Shortcut for min = max */ + + if (!strcmp(name, "pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value < 14+20+8) + value = 14+20+8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "debug")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + debug = value; + sprintf(pg_result, "OK: debug=%u", debug); + return count; + } + + if (!strcmp(name, "frags")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->nfrags = value; + sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); + return count; + } + if (!strcmp(name, "delay")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value == 0x7FFFFFFF) { + pkt_dev->delay_us = 0x7FFFFFFF; + pkt_dev->delay_ns = 0; + } else { + pkt_dev->delay_us = value / 1000; + pkt_dev->delay_ns = value % 1000; + } + sprintf(pg_result, "OK: delay=%u", 1000*pkt_dev->delay_us+pkt_dev->delay_ns); + return count; + } + if (!strcmp(name, "udp_src_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_src_min) { + pkt_dev->udp_src_min = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min); + return count; + } + if (!strcmp(name, "udp_dst_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_dst_min) { + pkt_dev->udp_dst_min = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min); + return count; + } + if (!strcmp(name, "udp_src_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_src_max) { + pkt_dev->udp_src_max = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max); + return count; + } + if (!strcmp(name, "udp_dst_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_dst_max) { + pkt_dev->udp_dst_max = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max); + return count; + } + if (!strcmp(name, "clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->clone_skb = value; + + sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); + return count; + } + if (!strcmp(name, "count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->count = value; + sprintf(pg_result, "OK: count=%llu", + (unsigned long long) pkt_dev->count); + return count; + } + if (!strcmp(name, "src_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (pkt_dev->src_mac_count != value) { + pkt_dev->src_mac_count = value; + pkt_dev->cur_src_mac_offset = 0; + } + sprintf(pg_result, "OK: src_mac_count=%d", pkt_dev->src_mac_count); + return count; + } + if (!strcmp(name, "dst_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (pkt_dev->dst_mac_count != value) { + pkt_dev->dst_mac_count = value; + pkt_dev->cur_dst_mac_offset = 0; + } + sprintf(pg_result, "OK: dst_mac_count=%d", pkt_dev->dst_mac_count); + return count; + } + if (!strcmp(name, "flag")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { return len; } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + if (strcmp(f, "IPSRC_RND") == 0) + pkt_dev->flags |= F_IPSRC_RND; + + else if (strcmp(f, "!IPSRC_RND") == 0) + pkt_dev->flags &= ~F_IPSRC_RND; + + else if (strcmp(f, "TXSIZE_RND") == 0) + pkt_dev->flags |= F_TXSIZE_RND; + + else if (strcmp(f, "!TXSIZE_RND") == 0) + pkt_dev->flags &= ~F_TXSIZE_RND; + + else if (strcmp(f, "IPDST_RND") == 0) + pkt_dev->flags |= F_IPDST_RND; + + else if (strcmp(f, "!IPDST_RND") == 0) + pkt_dev->flags &= ~F_IPDST_RND; + + else if (strcmp(f, "UDPSRC_RND") == 0) + pkt_dev->flags |= F_UDPSRC_RND; + + else if (strcmp(f, "!UDPSRC_RND") == 0) + pkt_dev->flags &= ~F_UDPSRC_RND; + + else if (strcmp(f, "UDPDST_RND") == 0) + pkt_dev->flags |= F_UDPDST_RND; + + else if (strcmp(f, "!UDPDST_RND") == 0) + pkt_dev->flags &= ~F_UDPDST_RND; + + else if (strcmp(f, "MACSRC_RND") == 0) + pkt_dev->flags |= F_MACSRC_RND; + + else if (strcmp(f, "!MACSRC_RND") == 0) + pkt_dev->flags &= ~F_MACSRC_RND; + + else if (strcmp(f, "MACDST_RND") == 0) + pkt_dev->flags |= F_MACDST_RND; + + else if (strcmp(f, "!MACDST_RND") == 0) + pkt_dev->flags &= ~F_MACDST_RND; + + else { + sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", + f, + "IPSRC_RND, IPDST_RND, TXSIZE_RND, UDPSRC_RND, UDPDST_RND, MACSRC_RND, MACDST_RND\n"); + return count; + } + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; + } + if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + if (len < 0) { return len; } + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_min) != 0) { + memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); + strncpy(pkt_dev->dst_min, buf, len); + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->cur_daddr = pkt_dev->daddr_min; + } + if(debug) + printk("pktgen: dst_min set to: %s\n", pkt_dev->dst_min); + i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); + return count; + } + if (!strcmp(name, "dst_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + if (len < 0) { return len; } + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_max) != 0) { + memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); + strncpy(pkt_dev->dst_max, buf, len); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + pkt_dev->cur_daddr = pkt_dev->daddr_max; + } + if(debug) + printk("pktgen: dst_max set to: %s\n", pkt_dev->dst_max); + i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); + return count; + } + if (!strcmp(name, "dst6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); + fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr); + + ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); + + if(debug) + printk("pktgen: dst6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6=%s", buf); + return count; + } + if (!strcmp(name, "dst6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + + ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->min_in6_daddr); + if(debug) + printk("pktgen: dst6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_min=%s", buf); + return count; + } + if (!strcmp(name, "dst6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + + if(debug) + printk("pktgen: dst6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_max=%s", buf); + return count; + } + if (!strcmp(name, "src6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); + fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr); + + ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); + + if(debug) + printk("pktgen: src6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6=%s", buf); + return count; + } + if (!strcmp(name, "src_min")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + if (len < 0) { return len; } + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_min) != 0) { + memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); + strncpy(pkt_dev->src_min, buf, len); + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->cur_saddr = pkt_dev->saddr_min; + } + if(debug) + printk("pktgen: src_min set to: %s\n", pkt_dev->src_min); + i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); + return count; + } + if (!strcmp(name, "src_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + if (len < 0) { return len; } + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_max) != 0) { + memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); + strncpy(pkt_dev->src_max, buf, len); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + pkt_dev->cur_saddr = pkt_dev->saddr_max; + } + if(debug) + printk("pktgen: src_max set to: %s\n", pkt_dev->src_max); + i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); + return count; + } + if (!strcmp(name, "dst_mac")) { + char *v = valstr; + unsigned char old_dmac[6]; + unsigned char *m = pkt_dev->dst_mac; + memcpy(old_dmac, pkt_dev->dst_mac, 6); + + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) { return len; } + memset(valstr, 0, sizeof(valstr)); + if( copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + i += len; + + for(*m = 0;*v && m < pkt_dev->dst_mac + 6; v++) { + if (*v >= '0' && *v <= '9') { + *m *= 16; + *m += *v - '0'; + } + if (*v >= 'A' && *v <= 'F') { + *m *= 16; + *m += *v - 'A' + 10; + } + if (*v >= 'a' && *v <= 'f') { + *m *= 16; + *m += *v - 'a' + 10; + } + if (*v == ':') { + m++; + *m = 0; + } + } + + /* Set up Dest MAC */ + if (memcmp(old_dmac, pkt_dev->dst_mac, 6) != 0) + memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6); + + sprintf(pg_result, "OK: dstmac"); + return count; + } + if (!strcmp(name, "src_mac")) { + char *v = valstr; + unsigned char *m = pkt_dev->src_mac; + + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) { return len; } + memset(valstr, 0, sizeof(valstr)); + if( copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + i += len; + + for(*m = 0;*v && m < pkt_dev->src_mac + 6; v++) { + if (*v >= '0' && *v <= '9') { + *m *= 16; + *m += *v - '0'; + } + if (*v >= 'A' && *v <= 'F') { + *m *= 16; + *m += *v - 'A' + 10; + } + if (*v >= 'a' && *v <= 'f') { + *m *= 16; + *m += *v - 'a' + 10; + } + if (*v == ':') { + m++; + *m = 0; + } + } + + sprintf(pg_result, "OK: srcmac"); + return count; + } + + if (!strcmp(name, "clear_counters")) { + pktgen_clear_counters(pkt_dev); + sprintf(pg_result, "OK: Clearing counters.\n"); + return count; + } + + if (!strcmp(name, "flows")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value > MAX_CFLOWS) + value = MAX_CFLOWS; + + pkt_dev->cflows = value; + sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); + return count; + } + + if (!strcmp(name, "flowlen")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->lflow = value; + sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); + return count; + } + + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); + return -EINVAL; +} + +static int proc_thread_read(char *buf , char **start, off_t offset, + int len, int *eof, void *data) +{ + char *p; + struct pktgen_thread *t = (struct pktgen_thread*)(data); + struct pktgen_dev *pkt_dev = NULL; + + + if (!t) { + printk("pktgen: ERROR: could not find thread in proc_thread_read\n"); + return -EINVAL; + } + + p = buf; + p += sprintf(p, "Name: %s max_before_softirq: %d\n", + t->name, t->max_before_softirq); + + p += sprintf(p, "Running: "); + + if_lock(t); + for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next) + if(pkt_dev->running) + p += sprintf(p, "%s ", pkt_dev->ifname); + + p += sprintf(p, "\nStopped: "); + + for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next) + if(!pkt_dev->running) + p += sprintf(p, "%s ", pkt_dev->ifname); + + if (t->result[0]) + p += sprintf(p, "\nResult: %s\n", t->result); + else + p += sprintf(p, "\nResult: NA\n"); + + *eof = 1; + + if_unlock(t); + + return p - buf; +} + +static int proc_thread_write(struct file *file, const char __user *user_buffer, + unsigned long count, void *data) +{ + int i = 0, max, len, ret; + char name[40]; + struct pktgen_thread *t; + char *pg_result; + unsigned long value = 0; + + if (count < 1) { + // sprintf(pg_result, "Wrong command format"); + return -EINVAL; + } + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count -i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) + printk("pktgen: t=%s, count=%lu\n", name, count); + + + t = (struct pktgen_thread*)(data); + if(!t) { + printk("pktgen: ERROR: No thread\n"); + ret = -EINVAL; + goto out; + } + + pg_result = &(t->result[0]); + + if (!strcmp(name, "add_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if( copy_from_user(f, &user_buffer[i], len) ) + return -EFAULT; + i += len; + thread_lock(); + pktgen_add_device(t, f); + thread_unlock(); + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + goto out; + } + + if (!strcmp(name, "rem_device_all")) { + thread_lock(); + t->control |= T_REMDEV; + thread_unlock(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/8); /* Propagate thread->control */ + ret = count; + sprintf(pg_result, "OK: rem_device_all"); + goto out; + } + + if (!strcmp(name, "max_before_softirq")) { + len = num_arg(&user_buffer[i], 10, &value); + thread_lock(); + t->max_before_softirq = value; + thread_unlock(); + ret = count; + sprintf(pg_result, "OK: max_before_softirq=%lu", value); + goto out; + } + + ret = -EINVAL; + out: + + return ret; +} + +static int create_proc_dir(void) +{ + int len; + /* does proc_dir already exists */ + len = strlen(PG_PROC_DIR); + + for (pg_proc_dir = proc_net->subdir; pg_proc_dir; pg_proc_dir=pg_proc_dir->next) { + if ((pg_proc_dir->namelen == len) && + (! memcmp(pg_proc_dir->name, PG_PROC_DIR, len))) + break; + } + + if (!pg_proc_dir) + pg_proc_dir = create_proc_entry(PG_PROC_DIR, S_IFDIR, proc_net); + + if (!pg_proc_dir) + return -ENODEV; + + return 0; +} + +static int remove_proc_dir(void) +{ + remove_proc_entry(PG_PROC_DIR, proc_net); + return 0; +} + +/* Think find or remove for NN */ +static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove) +{ + struct pktgen_thread *t; + struct pktgen_dev *pkt_dev = NULL; + + t = pktgen_threads; + + while (t) { + pkt_dev = pktgen_find_dev(t, ifname); + if (pkt_dev) { + if(remove) { + if_lock(t); + pktgen_remove_device(t, pkt_dev); + if_unlock(t); + } + break; + } + t = t->next; + } + return pkt_dev; +} + +static struct pktgen_dev *pktgen_NN_threads(const char* ifname, int remove) +{ + struct pktgen_dev *pkt_dev = NULL; + thread_lock(); + pkt_dev = __pktgen_NN_threads(ifname, remove); + thread_unlock(); + return pkt_dev; +} + +static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = (struct net_device *)(ptr); + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_DOWN: + case NETDEV_UP: + /* Ignore for now */ + break; + + case NETDEV_UNREGISTER: + pktgen_NN_threads(dev->name, REMOVE); + break; + }; + + return NOTIFY_DONE; +} + +/* Associate pktgen_dev with a device. */ + +static struct net_device* pktgen_setup_dev(struct pktgen_dev *pkt_dev) { + struct net_device *odev; + + /* Clean old setups */ + + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + odev = dev_get_by_name(pkt_dev->ifname); + + if (!odev) { + printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname); + goto out; + } + if (odev->type != ARPHRD_ETHER) { + printk("pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname); + goto out_put; + } + if (!netif_running(odev)) { + printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname); + goto out_put; + } + pkt_dev->odev = odev; + + return pkt_dev->odev; + +out_put: + dev_put(odev); +out: + return NULL; + +} + +/* Read pkt_dev from the interface and set up internal pktgen_dev + * structure to have the right information to create/send packets + */ +static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) +{ + /* Try once more, just in case it works now. */ + if (!pkt_dev->odev) + pktgen_setup_dev(pkt_dev); + + if (!pkt_dev->odev) { + printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + sprintf(pkt_dev->result, "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + return; + } + + /* Default to the interface's mac if not explicitly set. */ + + if ((pkt_dev->src_mac[0] == 0) && + (pkt_dev->src_mac[1] == 0) && + (pkt_dev->src_mac[2] == 0) && + (pkt_dev->src_mac[3] == 0) && + (pkt_dev->src_mac[4] == 0) && + (pkt_dev->src_mac[5] == 0)) { + + memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, 6); + } + /* Set up Dest MAC */ + memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6); + + /* Set up pkt size */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + + if(pkt_dev->flags & F_IPV6) { + /* + * Skip this automatic address setting until locks or functions + * gets exported + */ + +#ifdef NOTNOW + int i, set = 0, err=1; + struct inet6_dev *idev; + + for(i=0; i< IN6_ADDR_HSIZE; i++) + if(pkt_dev->cur_in6_saddr.s6_addr[i]) { + set = 1; + break; + } + + if(!set) { + + /* + * Use linklevel address if unconfigured. + * + * use ipv6_get_lladdr if/when it's get exported + */ + + + read_lock(&addrconf_lock); + if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &ifp->addr); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + if(err) printk("pktgen: ERROR: IPv6 link address not availble.\n"); + } +#endif + } + else { + pkt_dev->saddr_min = 0; + pkt_dev->saddr_max = 0; + if (strlen(pkt_dev->src_min) == 0) { + + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get(pkt_dev->odev); + if (in_dev) { + if (in_dev->ifa_list) { + pkt_dev->saddr_min = in_dev->ifa_list->ifa_address; + pkt_dev->saddr_max = pkt_dev->saddr_min; + } + __in_dev_put(in_dev); + } + rcu_read_unlock(); + } + else { + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + } + + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + } + /* Initialize current values. */ + pkt_dev->cur_dst_mac_offset = 0; + pkt_dev->cur_src_mac_offset = 0; + pkt_dev->cur_saddr = pkt_dev->saddr_min; + pkt_dev->cur_daddr = pkt_dev->daddr_min; + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + pkt_dev->nflows = 0; +} + +static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) +{ + __u64 start; + __u64 now; + + start = now = getCurUs(); + printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now)); + while (now < spin_until_us) { + /* TODO: optimise sleeping behavior */ + if (spin_until_us - now > (1000000/HZ)+1) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } else if (spin_until_us - now > 100) { + do_softirq(); + if (!pkt_dev->running) + return; + if (need_resched()) + schedule(); + } + + now = getCurUs(); + } + + pkt_dev->idle_acc += now - start; +} + + +/* Increment/randomize headers according to flags and current values + * for IP src/dest, UDP src/dst port, MAC-Addr src/dst + */ +static void mod_cur_headers(struct pktgen_dev *pkt_dev) { + __u32 imn; + __u32 imx; + int flow = 0; + + if(pkt_dev->cflows) { + flow = pktgen_random() % pkt_dev->cflows; + + if (pkt_dev->flows[flow].count > pkt_dev->lflow) + pkt_dev->flows[flow].count = 0; + } + + + /* Deal with source MAC */ + if (pkt_dev->src_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACSRC_RND) + mc = pktgen_random() % (pkt_dev->src_mac_count); + else { + mc = pkt_dev->cur_src_mac_offset++; + if (pkt_dev->cur_src_mac_offset > pkt_dev->src_mac_count) + pkt_dev->cur_src_mac_offset = 0; + } + + tmp = pkt_dev->src_mac[5] + (mc & 0xFF); + pkt_dev->hh[11] = tmp; + tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[10] = tmp; + tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[9] = tmp; + tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[8] = tmp; + tmp = (pkt_dev->src_mac[1] + (tmp >> 8)); + pkt_dev->hh[7] = tmp; + } + + /* Deal with Destination MAC */ + if (pkt_dev->dst_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACDST_RND) + mc = pktgen_random() % (pkt_dev->dst_mac_count); + + else { + mc = pkt_dev->cur_dst_mac_offset++; + if (pkt_dev->cur_dst_mac_offset > pkt_dev->dst_mac_count) { + pkt_dev->cur_dst_mac_offset = 0; + } + } + + tmp = pkt_dev->dst_mac[5] + (mc & 0xFF); + pkt_dev->hh[5] = tmp; + tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[4] = tmp; + tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[3] = tmp; + tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[2] = tmp; + tmp = (pkt_dev->dst_mac[1] + (tmp >> 8)); + pkt_dev->hh[1] = tmp; + } + + if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { + if (pkt_dev->flags & F_UDPSRC_RND) + pkt_dev->cur_udp_src = ((pktgen_random() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) + pkt_dev->udp_src_min); + + else { + pkt_dev->cur_udp_src++; + if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max) + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + } + } + + if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { + if (pkt_dev->flags & F_UDPDST_RND) { + pkt_dev->cur_udp_dst = ((pktgen_random() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) + pkt_dev->udp_dst_min); + } + else { + pkt_dev->cur_udp_dst++; + if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + } + } + + if (!(pkt_dev->flags & F_IPV6)) { + + if ((imn = ntohl(pkt_dev->saddr_min)) < (imx = ntohl(pkt_dev->saddr_max))) { + __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) + t = ((pktgen_random() % (imx - imn)) + imn); + else { + t = ntohl(pkt_dev->cur_saddr); + t++; + if (t > imx) { + t = imn; + } + } + pkt_dev->cur_saddr = htonl(t); + } + + if (pkt_dev->cflows && pkt_dev->flows[flow].count != 0) { + pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr; + } else { + + if ((imn = ntohl(pkt_dev->daddr_min)) < (imx = ntohl(pkt_dev->daddr_max))) { + __u32 t; + if (pkt_dev->flags & F_IPDST_RND) { + + t = ((pktgen_random() % (imx - imn)) + imn); + t = htonl(t); + + while( LOOPBACK(t) || MULTICAST(t) || BADCLASS(t) || ZERONET(t) || LOCAL_MCAST(t) ) { + t = ((pktgen_random() % (imx - imn)) + imn); + t = htonl(t); + } + pkt_dev->cur_daddr = t; + } + + else { + t = ntohl(pkt_dev->cur_daddr); + t++; + if (t > imx) { + t = imn; + } + pkt_dev->cur_daddr = htonl(t); + } + } + if(pkt_dev->cflows) { + pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; + pkt_dev->nflows++; + } + } + } + else /* IPV6 * */ + { + if(pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[3] == 0); + else { + int i; + + /* Only random destinations yet */ + + for(i=0; i < 4; i++) { + pkt_dev->cur_in6_daddr.s6_addr32[i] = + ((pktgen_random() | + pkt_dev->min_in6_daddr.s6_addr32[i]) & + pkt_dev->max_in6_daddr.s6_addr32[i]); + } + } + } + + if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { + __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { + t = ((pktgen_random() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)) + + pkt_dev->min_pkt_size); + } + else { + t = pkt_dev->cur_pkt_size + 1; + if (t > pkt_dev->max_pkt_size) + t = pkt_dev->min_pkt_size; + } + pkt_dev->cur_pkt_size = t; + } + + pkt_dev->flows[flow].count++; +} + + +static struct sk_buff *fill_packet_ipv4(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen, iplen; + struct iphdr *iph; + struct pktgen_hdr *pgh = NULL; + + skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); + udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + + memcpy(eth, pkt_dev->hh, 12); + *(u16*)ð[12] = __constant_htons(ETH_P_IP); + + datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8; /* Eth + IPh + UDPh */ + if (datalen < sizeof(struct pktgen_hdr)) + datalen = sizeof(struct pktgen_hdr); + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + 8); /* DATA + udphdr */ + udph->check = 0; /* No checksum */ + + iph->ihl = 5; + iph->version = 4; + iph->ttl = 32; + iph->tos = 0; + iph->protocol = IPPROTO_UDP; /* UDP */ + iph->saddr = pkt_dev->cur_saddr; + iph->daddr = pkt_dev->cur_daddr; + iph->frag_off = 0; + iplen = 20 + 8 + datalen; + iph->tot_len = htons(iplen); + iph->check = 0; + iph->check = ip_fast_csum((void *) iph, iph->ihl); + skb->protocol = __constant_htons(ETH_P_IP); + skb->mac.raw = ((u8 *)iph) - 14; + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + if (pkt_dev->nfrags <= 0) + pgh = (struct pktgen_hdr *)skb_put(skb, datalen); + else { + int frags = pkt_dev->nfrags; + int i; + + pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8); + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + if (datalen > frags*PAGE_SIZE) { + skb_put(skb, datalen-frags*PAGE_SIZE); + datalen = frags*PAGE_SIZE; + } + + i = 0; + while (datalen > 0) { + struct page *page = alloc_pages(GFP_KERNEL, 0); + skb_shinfo(skb)->frags[i].page = page; + skb_shinfo(skb)->frags[i].page_offset = 0; + skb_shinfo(skb)->frags[i].size = + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + datalen -= skb_shinfo(skb)->frags[i].size; + skb->len += skb_shinfo(skb)->frags[i].size; + skb->data_len += skb_shinfo(skb)->frags[i].size; + i++; + skb_shinfo(skb)->nr_frags = i; + } + + while (i < frags) { + int rem; + + if (i == 0) + break; + + rem = skb_shinfo(skb)->frags[i - 1].size / 2; + if (rem == 0) + break; + + skb_shinfo(skb)->frags[i - 1].size -= rem; + + skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1]; + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page; + skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size; + skb_shinfo(skb)->frags[i].size = rem; + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, convert them to network byte order */ + + if (pgh) { + struct timeval timestamp; + + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } + pkt_dev->seq_num++; + + return skb; +} + +/* + * scan_ip6, fmt_ip taken from dietlibc-0.21 + * Author Felix von Leitner <felix-dietlibc@fefe.de> + * + * Slightly modified for kernel. + * Should be candidate for net/ipv4/utils.c + * --ro + */ + +static unsigned int scan_ip6(const char *s,char ip[16]) +{ + unsigned int i; + unsigned int len=0; + unsigned long u; + char suffix[16]; + unsigned int prefixlen=0; + unsigned int suffixlen=0; + __u32 tmp; + + for (i=0; i<16; i++) ip[i]=0; + + for (;;) { + if (*s == ':') { + len++; + if (s[1] == ':') { /* Found "::", skip to part 2 */ + s+=2; + len++; + break; + } + s++; + } + { + char *tmp; + u=simple_strtoul(s,&tmp,16); + i=tmp-s; + } + + if (!i) return 0; + if (prefixlen==12 && s[i]=='.') { + + /* the last 4 bytes may be written as IPv4 address */ + + tmp = in_aton(s); + memcpy((struct in_addr*)(ip+12), &tmp, sizeof(tmp)); + return i+len; + } + ip[prefixlen++] = (u >> 8); + ip[prefixlen++] = (u & 255); + s += i; len += i; + if (prefixlen==16) + return len; + } + +/* part 2, after "::" */ + for (;;) { + if (*s == ':') { + if (suffixlen==0) + break; + s++; + len++; + } else if (suffixlen!=0) + break; + { + char *tmp; + u=simple_strtol(s,&tmp,16); + i=tmp-s; + } + if (!i) { + if (*s) len--; + break; + } + if (suffixlen+prefixlen<=12 && s[i]=='.') { + tmp = in_aton(s); + memcpy((struct in_addr*)(suffix+suffixlen), &tmp, sizeof(tmp)); + suffixlen+=4; + len+=strlen(s); + break; + } + suffix[suffixlen++] = (u >> 8); + suffix[suffixlen++] = (u & 255); + s += i; len += i; + if (prefixlen+suffixlen==16) + break; + } + for (i=0; i<suffixlen; i++) + ip[16-suffixlen+i] = suffix[i]; + return len; +} + +static char tohex(char hexdigit) { + return hexdigit>9?hexdigit+'a'-10:hexdigit+'0'; +} + +static int fmt_xlong(char* s,unsigned int i) { + char* bak=s; + *s=tohex((i>>12)&0xf); if (s!=bak || *s!='0') ++s; + *s=tohex((i>>8)&0xf); if (s!=bak || *s!='0') ++s; + *s=tohex((i>>4)&0xf); if (s!=bak || *s!='0') ++s; + *s=tohex(i&0xf); + return s-bak+1; +} + +static unsigned int fmt_ip6(char *s,const char ip[16]) { + unsigned int len; + unsigned int i; + unsigned int temp; + unsigned int compressing; + int j; + + len = 0; compressing = 0; + for (j=0; j<16; j+=2) { + +#ifdef V4MAPPEDPREFIX + if (j==12 && !memcmp(ip,V4mappedprefix,12)) { + inet_ntoa_r(*(struct in_addr*)(ip+12),s); + temp=strlen(s); + return len+temp; + } +#endif + temp = ((unsigned long) (unsigned char) ip[j] << 8) + + (unsigned long) (unsigned char) ip[j+1]; + if (temp == 0) { + if (!compressing) { + compressing=1; + if (j==0) { + *s++=':'; ++len; + } + } + } else { + if (compressing) { + compressing=0; + *s++=':'; ++len; + } + i = fmt_xlong(s,temp); len += i; s += i; + if (j<14) { + *s++ = ':'; + ++len; + } + } + } + if (compressing) { + *s++=':'; ++len; + } + *s=0; + return len; +} + +static struct sk_buff *fill_packet_ipv6(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen; + struct ipv6hdr *iph; + struct pktgen_hdr *pgh = NULL; + + skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr)); + udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + + + memcpy(eth, pkt_dev->hh, 12); + *(u16*)ð[12] = __constant_htons(ETH_P_IPV6); + + + datalen = pkt_dev->cur_pkt_size-14- + sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ + + if (datalen < sizeof(struct pktgen_hdr)) { + datalen = sizeof(struct pktgen_hdr); + if (net_ratelimit()) + printk(KERN_INFO "pktgen: increased datalen to %d\n", datalen); + } + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + sizeof(struct udphdr)); + udph->check = 0; /* No checksum */ + + *(u32*)iph = __constant_htonl(0x60000000); /* Version + flow */ + + iph->hop_limit = 32; + + iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->nexthdr = IPPROTO_UDP; + + ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); + ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); + + skb->mac.raw = ((u8 *)iph) - 14; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + if (pkt_dev->nfrags <= 0) + pgh = (struct pktgen_hdr *)skb_put(skb, datalen); + else { + int frags = pkt_dev->nfrags; + int i; + + pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8); + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + if (datalen > frags*PAGE_SIZE) { + skb_put(skb, datalen-frags*PAGE_SIZE); + datalen = frags*PAGE_SIZE; + } + + i = 0; + while (datalen > 0) { + struct page *page = alloc_pages(GFP_KERNEL, 0); + skb_shinfo(skb)->frags[i].page = page; + skb_shinfo(skb)->frags[i].page_offset = 0; + skb_shinfo(skb)->frags[i].size = + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + datalen -= skb_shinfo(skb)->frags[i].size; + skb->len += skb_shinfo(skb)->frags[i].size; + skb->data_len += skb_shinfo(skb)->frags[i].size; + i++; + skb_shinfo(skb)->nr_frags = i; + } + + while (i < frags) { + int rem; + + if (i == 0) + break; + + rem = skb_shinfo(skb)->frags[i - 1].size / 2; + if (rem == 0) + break; + + skb_shinfo(skb)->frags[i - 1].size -= rem; + + skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1]; + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page; + skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size; + skb_shinfo(skb)->frags[i].size = rem; + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, convert them to network byte order */ + /* should we update cloned packets too ? */ + if (pgh) { + struct timeval timestamp; + + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } + pkt_dev->seq_num++; + + return skb; +} + +static inline struct sk_buff *fill_packet(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + if(pkt_dev->flags & F_IPV6) + return fill_packet_ipv6(odev, pkt_dev); + else + return fill_packet_ipv4(odev, pkt_dev); +} + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) +{ + pkt_dev->seq_num = 1; + pkt_dev->idle_acc = 0; + pkt_dev->sofar = 0; + pkt_dev->tx_bytes = 0; + pkt_dev->errors = 0; +} + +/* Set up structure for sending pkts, clear counters */ + +static void pktgen_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev = NULL; + int started = 0; + + PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t)); + + if_lock(t); + for (pkt_dev = t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { + + /* + * setup odev and create initial packet. + */ + pktgen_setup_inject(pkt_dev); + + if(pkt_dev->odev) { + pktgen_clear_counters(pkt_dev); + pkt_dev->running = 1; /* Cranke yeself! */ + pkt_dev->skb = NULL; + pkt_dev->started_at = getCurUs(); + pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */ + pkt_dev->next_tx_ns = 0; + + strcpy(pkt_dev->result, "Starting"); + started++; + } + else + strcpy(pkt_dev->result, "Error starting"); + } + if_unlock(t); + if(started) t->control &= ~(T_STOP); +} + +static void pktgen_stop_all_threads_ifs(void) +{ + struct pktgen_thread *t = pktgen_threads; + + PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads.\n")); + + thread_lock(); + while(t) { + pktgen_stop(t); + t = t->next; + } + thread_unlock(); +} + +static int thread_is_running(struct pktgen_thread *t ) +{ + struct pktgen_dev *next; + int res = 0; + + for(next=t->if_list; next; next=next->next) { + if(next->running) { + res = 1; + break; + } + } + return res; +} + +static int pktgen_wait_thread_run(struct pktgen_thread *t ) +{ + if_lock(t); + + while(thread_is_running(t)) { + + if_unlock(t); + + msleep_interruptible(100); + + if (signal_pending(current)) + goto signal; + if_lock(t); + } + if_unlock(t); + return 1; + signal: + return 0; +} + +static int pktgen_wait_all_threads_run(void) +{ + struct pktgen_thread *t = pktgen_threads; + int sig = 1; + + while (t) { + sig = pktgen_wait_thread_run(t); + if( sig == 0 ) break; + thread_lock(); + t=t->next; + thread_unlock(); + } + if(sig == 0) { + thread_lock(); + while (t) { + t->control |= (T_STOP); + t=t->next; + } + thread_unlock(); + } + return sig; +} + +static void pktgen_run_all_threads(void) +{ + struct pktgen_thread *t = pktgen_threads; + + PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n")); + + thread_lock(); + + while(t) { + t->control |= (T_RUN); + t = t->next; + } + thread_unlock(); + + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/8); /* Propagate thread->control */ + + pktgen_wait_all_threads_run(); +} + + +static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) +{ + __u64 total_us, bps, mbps, pps, idle; + char *p = pkt_dev->result; + + total_us = pkt_dev->stopped_at - pkt_dev->started_at; + + idle = pkt_dev->idle_acc; + + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", + (unsigned long long) total_us, + (unsigned long long)(total_us - idle), + (unsigned long long) idle, + (unsigned long long) pkt_dev->sofar, + pkt_dev->cur_pkt_size, nr_frags); + + pps = pkt_dev->sofar * USEC_PER_SEC; + + while ((total_us >> 32) != 0) { + pps >>= 1; + total_us >>= 1; + } + + do_div(pps, total_us); + + bps = pps * 8 * pkt_dev->cur_pkt_size; + + mbps = bps; + do_div(mbps, 1000000); + p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu", + (unsigned long long) pps, + (unsigned long long) mbps, + (unsigned long long) bps, + (unsigned long long) pkt_dev->errors); +} + + +/* Set stopped-at timer, remove from running list, do counters & statistics */ + +static int pktgen_stop_device(struct pktgen_dev *pkt_dev) +{ + + if (!pkt_dev->running) { + printk("pktgen: interface: %s is already stopped\n", pkt_dev->ifname); + return -EINVAL; + } + + pkt_dev->stopped_at = getCurUs(); + pkt_dev->running = 0; + + show_results(pkt_dev, skb_shinfo(pkt_dev->skb)->nr_frags); + + if (pkt_dev->skb) + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = NULL; + + return 0; +} + +static struct pktgen_dev *next_to_run(struct pktgen_thread *t ) +{ + struct pktgen_dev *next, *best = NULL; + + if_lock(t); + + for(next=t->if_list; next ; next=next->next) { + if(!next->running) continue; + if(best == NULL) best=next; + else if ( next->next_tx_us < best->next_tx_us) + best = next; + } + if_unlock(t); + return best; +} + +static void pktgen_stop(struct pktgen_thread *t) { + struct pktgen_dev *next = NULL; + + PG_DEBUG(printk("pktgen: entering pktgen_stop.\n")); + + if_lock(t); + + for(next=t->if_list; next; next=next->next) + pktgen_stop_device(next); + + if_unlock(t); +} + +static void pktgen_rem_all_ifs(struct pktgen_thread *t) +{ + struct pktgen_dev *cur, *next = NULL; + + /* Remove all devices, free mem */ + + if_lock(t); + + for(cur=t->if_list; cur; cur=next) { + next = cur->next; + pktgen_remove_device(t, cur); + } + + if_unlock(t); +} + +static void pktgen_rem_thread(struct pktgen_thread *t) +{ + /* Remove from the thread list */ + + struct pktgen_thread *tmp = pktgen_threads; + + if (strlen(t->fname)) + remove_proc_entry(t->fname, NULL); + + thread_lock(); + + if (tmp == t) + pktgen_threads = tmp->next; + else { + while (tmp) { + if (tmp->next == t) { + tmp->next = t->next; + t->next = NULL; + break; + } + tmp = tmp->next; + } + } + thread_unlock(); +} + +static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) +{ + struct net_device *odev = NULL; + __u64 idle_start = 0; + int ret; + + odev = pkt_dev->odev; + + if (pkt_dev->delay_us || pkt_dev->delay_ns) { + u64 now; + + now = getCurUs(); + if (now < pkt_dev->next_tx_us) + spin(pkt_dev, pkt_dev->next_tx_us); + + /* This is max DELAY, this has special meaning of + * "never transmit" + */ + if (pkt_dev->delay_us == 0x7FFFFFFF) { + pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us; + pkt_dev->next_tx_ns = pkt_dev->delay_ns; + goto out; + } + } + + if (netif_queue_stopped(odev) || need_resched()) { + idle_start = getCurUs(); + + if (!netif_running(odev)) { + pktgen_stop_device(pkt_dev); + goto out; + } + if (need_resched()) + schedule(); + + pkt_dev->idle_acc += getCurUs() - idle_start; + + if (netif_queue_stopped(odev)) { + pkt_dev->next_tx_us = getCurUs(); /* TODO */ + pkt_dev->next_tx_ns = 0; + goto out; /* Try the next interface */ + } + } + + if (pkt_dev->last_ok || !pkt_dev->skb) { + if ((++pkt_dev->clone_count >= pkt_dev->clone_skb ) || (!pkt_dev->skb)) { + /* build a new pkt */ + if (pkt_dev->skb) + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = fill_packet(odev, pkt_dev); + if (pkt_dev->skb == NULL) { + printk("pktgen: ERROR: couldn't allocate skb in fill_packet.\n"); + schedule(); + pkt_dev->clone_count--; /* back out increment, OOM */ + goto out; + } + pkt_dev->allocated_skbs++; + pkt_dev->clone_count = 0; /* reset counter */ + } + } + + spin_lock_bh(&odev->xmit_lock); + if (!netif_queue_stopped(odev)) { + + atomic_inc(&(pkt_dev->skb->users)); +retry_now: + ret = odev->hard_start_xmit(pkt_dev->skb, odev); + if (likely(ret == NETDEV_TX_OK)) { + pkt_dev->last_ok = 1; + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; + + } else if (ret == NETDEV_TX_LOCKED + && (odev->features & NETIF_F_LLTX)) { + cpu_relax(); + goto retry_now; + } else { /* Retry it next time */ + + atomic_dec(&(pkt_dev->skb->users)); + + if (debug && net_ratelimit()) + printk(KERN_INFO "pktgen: Hard xmit error\n"); + + pkt_dev->errors++; + pkt_dev->last_ok = 0; + } + + pkt_dev->next_tx_us = getCurUs(); + pkt_dev->next_tx_ns = 0; + + pkt_dev->next_tx_us += pkt_dev->delay_us; + pkt_dev->next_tx_ns += pkt_dev->delay_ns; + + if (pkt_dev->next_tx_ns > 1000) { + pkt_dev->next_tx_us++; + pkt_dev->next_tx_ns -= 1000; + } + } + + else { /* Retry it next time */ + pkt_dev->last_ok = 0; + pkt_dev->next_tx_us = getCurUs(); /* TODO */ + pkt_dev->next_tx_ns = 0; + } + + spin_unlock_bh(&odev->xmit_lock); + + /* If pkt_dev->count is zero, then run forever */ + if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { + if (atomic_read(&(pkt_dev->skb->users)) != 1) { + idle_start = getCurUs(); + while (atomic_read(&(pkt_dev->skb->users)) != 1) { + if (signal_pending(current)) { + break; + } + schedule(); + } + pkt_dev->idle_acc += getCurUs() - idle_start; + } + + /* Done with this */ + pktgen_stop_device(pkt_dev); + } + out:; + } + +/* + * Main loop of the thread goes here + */ + +static void pktgen_thread_worker(struct pktgen_thread *t) +{ + DEFINE_WAIT(wait); + struct pktgen_dev *pkt_dev = NULL; + int cpu = t->cpu; + sigset_t tmpsig; + u32 max_before_softirq; + u32 tx_since_softirq = 0; + + daemonize("pktgen/%d", cpu); + + /* Block all signals except SIGKILL, SIGSTOP and SIGTERM */ + + spin_lock_irq(¤t->sighand->siglock); + tmpsig = current->blocked; + siginitsetinv(¤t->blocked, + sigmask(SIGKILL) | + sigmask(SIGSTOP)| + sigmask(SIGTERM)); + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + /* Migrate to the right CPU */ + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + if (smp_processor_id() != cpu) + BUG(); + + init_waitqueue_head(&t->queue); + + t->control &= ~(T_TERMINATE); + t->control &= ~(T_RUN); + t->control &= ~(T_STOP); + t->control &= ~(T_REMDEV); + + t->pid = current->pid; + + PG_DEBUG(printk("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid)); + + max_before_softirq = t->max_before_softirq; + + __set_current_state(TASK_INTERRUPTIBLE); + mb(); + + while (1) { + + __set_current_state(TASK_RUNNING); + + /* + * Get next dev to xmit -- if any. + */ + + pkt_dev = next_to_run(t); + + if (pkt_dev) { + + pktgen_xmit(pkt_dev); + + /* + * We like to stay RUNNING but must also give + * others fair share. + */ + + tx_since_softirq += pkt_dev->last_ok; + + if (tx_since_softirq > max_before_softirq) { + if (local_softirq_pending()) + do_softirq(); + tx_since_softirq = 0; + } + } else { + prepare_to_wait(&(t->queue), &wait, TASK_INTERRUPTIBLE); + schedule_timeout(HZ/10); + finish_wait(&(t->queue), &wait); + } + + /* + * Back from sleep, either due to the timeout or signal. + * We check if we have any "posted" work for us. + */ + + if (t->control & T_TERMINATE || signal_pending(current)) + /* we received a request to terminate ourself */ + break; + + + if(t->control & T_STOP) { + pktgen_stop(t); + t->control &= ~(T_STOP); + } + + if(t->control & T_RUN) { + pktgen_run(t); + t->control &= ~(T_RUN); + } + + if(t->control & T_REMDEV) { + pktgen_rem_all_ifs(t); + t->control &= ~(T_REMDEV); + } + + if (need_resched()) + schedule(); + } + + PG_DEBUG(printk("pktgen: %s stopping all device\n", t->name)); + pktgen_stop(t); + + PG_DEBUG(printk("pktgen: %s removing all device\n", t->name)); + pktgen_rem_all_ifs(t); + + PG_DEBUG(printk("pktgen: %s removing thread.\n", t->name)); + pktgen_rem_thread(t); +} + +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* ifname) +{ + struct pktgen_dev *pkt_dev = NULL; + if_lock(t); + + for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { + if (strcmp(pkt_dev->ifname, ifname) == 0) { + break; + } + } + + if_unlock(t); + PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname,pkt_dev)); + return pkt_dev; +} + +/* + * Adds a dev at front of if_list. + */ + +static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) +{ + int rv = 0; + + if_lock(t); + + if (pkt_dev->pg_thread) { + printk("pktgen: ERROR: already assigned to a thread.\n"); + rv = -EBUSY; + goto out; + } + pkt_dev->next =t->if_list; t->if_list=pkt_dev; + pkt_dev->pg_thread = t; + pkt_dev->running = 0; + + out: + if_unlock(t); + return rv; +} + +/* Called under thread lock */ + +static int pktgen_add_device(struct pktgen_thread *t, const char* ifname) +{ + struct pktgen_dev *pkt_dev; + + /* We don't allow a device to be on several threads */ + + if( (pkt_dev = __pktgen_NN_threads(ifname, FIND)) == NULL) { + + pkt_dev = kmalloc(sizeof(struct pktgen_dev), GFP_KERNEL); + if (!pkt_dev) + return -ENOMEM; + + memset(pkt_dev, 0, sizeof(struct pktgen_dev)); + + pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state)); + if (pkt_dev->flows == NULL) { + kfree(pkt_dev); + return -ENOMEM; + } + memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state)); + + pkt_dev->min_pkt_size = ETH_ZLEN; + pkt_dev->max_pkt_size = ETH_ZLEN; + pkt_dev->nfrags = 0; + pkt_dev->clone_skb = pg_clone_skb_d; + pkt_dev->delay_us = pg_delay_d / 1000; + pkt_dev->delay_ns = pg_delay_d % 1000; + pkt_dev->count = pg_count_d; + pkt_dev->sofar = 0; + pkt_dev->udp_src_min = 9; /* sink port */ + pkt_dev->udp_src_max = 9; + pkt_dev->udp_dst_min = 9; + pkt_dev->udp_dst_max = 9; + + strncpy(pkt_dev->ifname, ifname, 31); + sprintf(pkt_dev->fname, "net/%s/%s", PG_PROC_DIR, ifname); + + if (! pktgen_setup_dev(pkt_dev)) { + printk("pktgen: ERROR: pktgen_setup_dev failed.\n"); + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return -ENODEV; + } + + pkt_dev->proc_ent = create_proc_entry(pkt_dev->fname, 0600, NULL); + if (!pkt_dev->proc_ent) { + printk("pktgen: cannot create %s procfs entry.\n", pkt_dev->fname); + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return -EINVAL; + } + pkt_dev->proc_ent->read_proc = proc_if_read; + pkt_dev->proc_ent->write_proc = proc_if_write; + pkt_dev->proc_ent->data = (void*)(pkt_dev); + pkt_dev->proc_ent->owner = THIS_MODULE; + + return add_dev_to_thread(t, pkt_dev); + } + else { + printk("pktgen: ERROR: interface already used.\n"); + return -EBUSY; + } +} + +static struct pktgen_thread *pktgen_find_thread(const char* name) +{ + struct pktgen_thread *t = NULL; + + thread_lock(); + + t = pktgen_threads; + while (t) { + if (strcmp(t->name, name) == 0) + break; + + t = t->next; + } + thread_unlock(); + return t; +} + +static int pktgen_create_thread(const char* name, int cpu) +{ + struct pktgen_thread *t = NULL; + + if (strlen(name) > 31) { + printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n"); + return -EINVAL; + } + + if (pktgen_find_thread(name)) { + printk("pktgen: ERROR: thread: %s already exists\n", name); + return -EINVAL; + } + + t = (struct pktgen_thread*)(kmalloc(sizeof(struct pktgen_thread), GFP_KERNEL)); + if (!t) { + printk("pktgen: ERROR: out of memory, can't create new thread.\n"); + return -ENOMEM; + } + + memset(t, 0, sizeof(struct pktgen_thread)); + strcpy(t->name, name); + spin_lock_init(&t->if_lock); + t->cpu = cpu; + + sprintf(t->fname, "net/%s/%s", PG_PROC_DIR, t->name); + t->proc_ent = create_proc_entry(t->fname, 0600, NULL); + if (!t->proc_ent) { + printk("pktgen: cannot create %s procfs entry.\n", t->fname); + kfree(t); + return -EINVAL; + } + t->proc_ent->read_proc = proc_thread_read; + t->proc_ent->write_proc = proc_thread_write; + t->proc_ent->data = (void*)(t); + t->proc_ent->owner = THIS_MODULE; + + t->next = pktgen_threads; + pktgen_threads = t; + + if (kernel_thread((void *) pktgen_thread_worker, (void *) t, + CLONE_FS | CLONE_FILES | CLONE_SIGHAND) < 0) + printk("pktgen: kernel_thread() failed for cpu %d\n", t->cpu); + + return 0; +} + +/* + * Removes a device from the thread if_list. + */ +static void _rem_dev_from_if_list(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) +{ + struct pktgen_dev *i, *prev = NULL; + + i = t->if_list; + + while(i) { + if(i == pkt_dev) { + if(prev) prev->next = i->next; + else t->if_list = NULL; + break; + } + prev = i; + i=i->next; + } +} + +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) +{ + + PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev)); + + if (pkt_dev->running) { + printk("pktgen:WARNING: trying to remove a running interface, stopping it now.\n"); + pktgen_stop_device(pkt_dev); + } + + /* Dis-associate from the interface */ + + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + /* And update the thread if_list */ + + _rem_dev_from_if_list(t, pkt_dev); + + /* Clean up proc file system */ + + if (strlen(pkt_dev->fname)) + remove_proc_entry(pkt_dev->fname, NULL); + + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return 0; +} + +static int __init pg_init(void) +{ + int cpu; + printk(version); + + module_fname[0] = 0; + + create_proc_dir(); + + sprintf(module_fname, "net/%s/pgctrl", PG_PROC_DIR); + module_proc_ent = create_proc_entry(module_fname, 0600, NULL); + if (!module_proc_ent) { + printk("pktgen: ERROR: cannot create %s procfs entry.\n", module_fname); + return -EINVAL; + } + + module_proc_ent->proc_fops = &pktgen_fops; + module_proc_ent->data = NULL; + + /* Register us to receive netdevice events */ + register_netdevice_notifier(&pktgen_notifier_block); + + for (cpu = 0; cpu < NR_CPUS ; cpu++) { + char buf[30]; + + if (!cpu_online(cpu)) + continue; + + sprintf(buf, "kpktgend_%i", cpu); + pktgen_create_thread(buf, cpu); + } + return 0; +} + +static void __exit pg_cleanup(void) +{ + wait_queue_head_t queue; + init_waitqueue_head(&queue); + + /* Stop all interfaces & threads */ + + while (pktgen_threads) { + struct pktgen_thread *t = pktgen_threads; + pktgen_threads->control |= (T_TERMINATE); + + wait_event_interruptible_timeout(queue, (t != pktgen_threads), HZ); + } + + /* Un-register us from receiving netdevice events */ + unregister_netdevice_notifier(&pktgen_notifier_block); + + /* Clean up proc file system */ + + remove_proc_entry(module_fname, NULL); + + remove_proc_dir(); +} + + +module_init(pg_init); +module_exit(pg_cleanup); + +MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se"); +MODULE_DESCRIPTION("Packet Generator tool"); +MODULE_LICENSE("GPL"); +module_param(pg_count_d, int, 0); +module_param(pg_delay_d, int, 0); +module_param(pg_clone_skb_d, int, 0); +module_param(debug, int, 0); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c new file mode 100644 index 000000000000..d69ad90e5811 --- /dev/null +++ b/net/core/rtnetlink.c @@ -0,0 +1,711 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/security.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/string.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +DECLARE_MUTEX(rtnl_sem); + +void rtnl_lock(void) +{ + rtnl_shlock(); +} + +int rtnl_lock_interruptible(void) +{ + return down_interruptible(&rtnl_sem); +} + +void rtnl_unlock(void) +{ + rtnl_shunlock(); + + netdev_run_todo(); +} + +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr*)*maxattr); + + while (RTA_OK(rta, len)) { + unsigned flavor = rta->rta_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = rta; + rta = RTA_NEXT(rta, len); + } + return 0; +} + +struct sock *rtnl; + +struct rtnetlink_link * rtnetlink_links[NPROTO]; + +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = +{ + NLMSG_LENGTH(sizeof(struct ifinfomsg)), + NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct ndmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcamsg)) +}; + +static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = +{ + IFLA_MAX, + IFA_MAX, + RTA_MAX, + NDA_MAX, + RTA_MAX, + TCA_MAX, + TCA_MAX, + TCA_MAX, + TCAA_MAX +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); +} + +size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) +{ + size_t ret = RTA_PAYLOAD(rta); + char *src = RTA_DATA(rta); + + if (ret > 0 && src[ret - 1] == '\0') + ret--; + if (size > 0) { + size_t len = (ret >= size) ? size - 1 : ret; + memset(dest, 0, size); + memcpy(dest, src, len); + } + return ret; +} + +int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) +{ + struct rtattr *mx = (struct rtattr*)skb->tail; + int i; + + RTA_PUT(skb, RTA_METRICS, 0, NULL); + for (i=0; i<RTAX_MAX; i++) { + if (metrics[i]) + RTA_PUT(skb, i+1, sizeof(u32), metrics+i); + } + mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); + return 0; + +rtattr_failure: + skb_trim(skb, (u8*)mx - skb->data); + return -1; +} + + +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = change; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + + if (1) { + u32 txqlen = dev->tx_queue_len; + RTA_PUT(skb, IFLA_TXQLEN, sizeof(txqlen), &txqlen); + } + + if (1) { + u32 weight = dev->weight; + RTA_PUT(skb, IFLA_WEIGHT, sizeof(weight), &weight); + } + + if (1) { + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + RTA_PUT(skb, IFLA_MAP, sizeof(map), &map); + } + + if (dev->addr_len) { + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + + if (1) { + u32 mtu = dev->mtu; + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + } + + if (dev->ifindex != dev->iflink) { + u32 iflink = dev->iflink; + RTA_PUT(skb, IFLA_LINK, sizeof(iflink), &iflink); + } + + if (dev->qdisc_sleeping) + RTA_PUT(skb, IFLA_QDISC, + strlen(dev->qdisc_sleeping->ops->id) + 1, + dev->qdisc_sleeping->ops->id); + + if (dev->master) { + u32 master = dev->master->ifindex; + RTA_PUT(skb, IFLA_MASTER, sizeof(master), &master); + } + + if (dev->get_stats) { + unsigned long *stats = (unsigned long*)dev->get_stats(dev); + if (stats) { + struct rtattr *a; + __u32 *s; + int i; + int n = sizeof(struct rtnl_link_stats)/4; + + a = __RTA_PUT(skb, IFLA_STATS, n*4); + s = RTA_DATA(a); + for (i=0; i<n; i++) + s[i] = stats[i]; + } + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) + break; + } + read_unlock(&dev_base_lock); + cb->args[0] = idx; + + return skb->len; +} + +static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ifinfomsg *ifm = NLMSG_DATA(nlh); + struct rtattr **ida = arg; + struct net_device *dev; + int err, send_addr_notify = 0; + + if (ifm->ifi_index >= 0) + dev = dev_get_by_index(ifm->ifi_index); + else if (ida[IFLA_IFNAME - 1]) { + char ifname[IFNAMSIZ]; + + if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1], + IFNAMSIZ) >= IFNAMSIZ) + return -EINVAL; + dev = dev_get_by_name(ifname); + } else + return -EINVAL; + + if (!dev) + return -ENODEV; + + err = -EINVAL; + + if (ifm->ifi_flags) + dev_change_flags(dev, ifm->ifi_flags); + + if (ida[IFLA_MAP - 1]) { + struct rtnl_link_ifmap *u_map; + struct ifmap k_map; + + if (!dev->set_config) { + err = -EOPNOTSUPP; + goto out; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto out; + } + + if (ida[IFLA_MAP - 1]->rta_len != RTA_LENGTH(sizeof(*u_map))) + goto out; + + u_map = RTA_DATA(ida[IFLA_MAP - 1]); + + k_map.mem_start = (unsigned long) u_map->mem_start; + k_map.mem_end = (unsigned long) u_map->mem_end; + k_map.base_addr = (unsigned short) u_map->base_addr; + k_map.irq = (unsigned char) u_map->irq; + k_map.dma = (unsigned char) u_map->dma; + k_map.port = (unsigned char) u_map->port; + + err = dev->set_config(dev, &k_map); + + if (err) + goto out; + } + + if (ida[IFLA_ADDRESS - 1]) { + if (!dev->set_mac_address) { + err = -EOPNOTSUPP; + goto out; + } + if (!netif_device_present(dev)) { + err = -ENODEV; + goto out; + } + if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len)) + goto out; + + err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1])); + if (err) + goto out; + send_addr_notify = 1; + } + + if (ida[IFLA_BROADCAST - 1]) { + if (ida[IFLA_BROADCAST - 1]->rta_len != RTA_LENGTH(dev->addr_len)) + goto out; + memcpy(dev->broadcast, RTA_DATA(ida[IFLA_BROADCAST - 1]), + dev->addr_len); + send_addr_notify = 1; + } + + if (ida[IFLA_MTU - 1]) { + if (ida[IFLA_MTU - 1]->rta_len != RTA_LENGTH(sizeof(u32))) + goto out; + err = dev_set_mtu(dev, *((u32 *) RTA_DATA(ida[IFLA_MTU - 1]))); + + if (err) + goto out; + + } + + if (ida[IFLA_TXQLEN - 1]) { + if (ida[IFLA_TXQLEN - 1]->rta_len != RTA_LENGTH(sizeof(u32))) + goto out; + + dev->tx_queue_len = *((u32 *) RTA_DATA(ida[IFLA_TXQLEN - 1])); + } + + if (ida[IFLA_WEIGHT - 1]) { + if (ida[IFLA_WEIGHT - 1]->rta_len != RTA_LENGTH(sizeof(u32))) + goto out; + + dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1])); + } + + if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) { + char ifname[IFNAMSIZ]; + + if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1], + IFNAMSIZ) >= IFNAMSIZ) + goto out; + err = dev_change_name(dev, ifname); + if (err) + goto out; + } + + err = 0; + +out: + if (send_addr_notify) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + + dev_put(dev); + return err; +} + +static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx=1; idx<NPROTO; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == PF_PACKET) + continue; + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb)) + break; + } + cb->family = idx; + + return skb->len; +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + + sizeof(struct rtnl_link_ifmap) + + sizeof(struct rtnl_link_stats) + 128); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); +} + +static int rtnetlink_done(struct netlink_callback *cb) +{ + return 0; +} + +/* Protected by RTNL sempahore. */ +static struct rtattr **rta_buf; +static int rtattr_max; + +/* Process one rtnetlink message. */ + +static __inline__ int +rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ + struct rtnetlink_link *link; + struct rtnetlink_link *link_tab; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + /* Only requests are handled by kernel now */ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < RTM_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type > RTM_MAX) + goto err_inval; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + if (family >= NPROTO) { + *errp = -EAFNOSUPPORT; + return -1; + } + + link_tab = rtnetlink_links[family]; + if (link_tab == NULL) + link_tab = rtnetlink_links[PF_UNSPEC]; + link = &link_tab[type]; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && security_netlink_recv(skb)) { + *errp = -EPERM; + return -1; + } + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + u32 rlen; + + if (link->dumpit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + + if (link->dumpit == NULL) + goto err_inval; + + if ((*errp = netlink_dump_start(rtnl, skb, nlh, + link->dumpit, + rtnetlink_done)) != 0) { + return -1; + } + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return -1; + } + + memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + goto err_inval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + goto err_inval; + rta_buf[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (link->doit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + if (link->doit == NULL) + goto err_inval; + err = link->doit(skb, nlh, (void *)&rta_buf[0]); + + *errp = err; + return err; + +err_inval: + *errp = -EINVAL; + return -1; +} + +/* + * Process one packet of messages. + * Malformed skbs with wrong lengths of messages are discarded silently. + */ + +static inline int rtnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (rtnetlink_rcv_msg(skb, nlh, &err)) { + /* Not error, but we must interrupt processing here: + * Note, that in this case we do not pull message + * from skb, it will be processed later. + */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags&NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +/* + * rtnetlink input queue processing routine: + * - try to acquire shared lock. If it is failed, defer processing. + * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, + * that will occur, when a dump started and/or acquisition of + * exclusive lock failed. + */ + +static void rtnetlink_rcv(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (rtnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (rtnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, + skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + up(&rtnl_sem); + + netdev_run_todo(); + } while (rtnl && rtnl->sk_receive_queue.qlen); +} + +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo }, + [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, + [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, + [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add }, + [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete }, + [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info } +}; + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + switch (event) { + case NETDEV_UNREGISTER: + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + break; + case NETDEV_REGISTER: + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + break; + case NETDEV_UP: + case NETDEV_DOWN: + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + break; + case NETDEV_CHANGE: + case NETDEV_GOING_DOWN: + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { + .notifier_call = rtnetlink_event, +}; + +void __init rtnetlink_init(void) +{ + int i; + + rtattr_max = 0; + for (i = 0; i < ARRAY_SIZE(rta_max); i++) + if (rta_max[i] > rtattr_max) + rtattr_max = rta_max[i]; + rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); + if (!rta_buf) + panic("rtnetlink_init: cannot allocate rta_buf\n"); + + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + if (rtnl == NULL) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); + register_netdevice_notifier(&rtnetlink_dev_notifier); + rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table; + rtnetlink_links[PF_PACKET] = link_rtnetlink_table; +} + +EXPORT_SYMBOL(__rta_fill); +EXPORT_SYMBOL(rtattr_strlcpy); +EXPORT_SYMBOL(rtattr_parse); +EXPORT_SYMBOL(rtnetlink_links); +EXPORT_SYMBOL(rtnetlink_put_metrics); +EXPORT_SYMBOL(rtnl); +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_lock_interruptible); +EXPORT_SYMBOL(rtnl_sem); +EXPORT_SYMBOL(rtnl_unlock); diff --git a/net/core/scm.c b/net/core/scm.c new file mode 100644 index 000000000000..a2ebf30f6aa8 --- /dev/null +++ b/net/core/scm.c @@ -0,0 +1,291 @@ +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Alignment and value checking mods by Craig Metz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/security.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/compat.h> +#include <net/scm.h> + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. + */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || + creds->gid == current->sgid) || capable(CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > SCM_MAX_FD) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget(fd))) + return -EBADF; + *fpp++ = file; + fpl->count++; + } + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } +} + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) + { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. + But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (!CMSG_OK(msg, cmsg)) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&p->creds); + if (err) + goto error; + break; + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + return 0; + +error: + scm_destroy(p); + return err; +} + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control; + struct cmsghdr cmhdr; + int cmlen = CMSG_LEN(len); + int err; + + if (MSG_CMSG_COMPAT & msg->msg_flags) + return put_cmsg_compat(msg, level, type, len, data); + + if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. */ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: + return err; +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control; + + int fdmax = 0; + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int __user *cmfptr; + int err = 0, i; + + if (MSG_CMSG_COMPAT & msg->msg_flags) { + scm_detach_fds_compat(msg, scm); + return; + } + + if (msg->msg_controllen > sizeof(struct cmsghdr)) + fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) + / sizeof(int)); + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) + { + int new_fd; + err = security_file_receive(fp[i]); + if (err) + break; + err = get_unused_fd(); + if (err < 0) + break; + new_fd = err; + err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. */ + get_file(fp[i]); + fd_install(new_fd, fp[i]); + } + + if (i > 0) + { + int cmlen = CMSG_LEN(i*sizeof(int)); + if (!err) + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i*sizeof(int)); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + if (i < fdnum || (fdnum && fdmax <= 0)) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + for (i=fpl->count-1; i>=0; i--) + get_file(fpl->fp[i]); + memcpy(new_fpl, fpl, sizeof(*fpl)); + } + return new_fpl; +} + +EXPORT_SYMBOL(__scm_destroy); +EXPORT_SYMBOL(__scm_send); +EXPORT_SYMBOL(put_cmsg); +EXPORT_SYMBOL(scm_detach_fds); +EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/skbuff.c b/net/core/skbuff.c new file mode 100644 index 000000000000..bf02ca9f80ac --- /dev/null +++ b/net/core/skbuff.c @@ -0,0 +1,1460 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> + * Florian La Roche <rzsfl@rz.uni-sb.de> + * + * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#ifdef CONFIG_NET_CLS_ACT +#include <net/pkt_sched.h> +#endif +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/cache.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/highmem.h> + +#include <net/protocol.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <net/xfrm.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +static kmem_cache_t *skbuff_head_cache; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put(). Not user callable. + */ +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push(). Not user callable. + */ + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +/** + * alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. + */ +struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, + gfp_mask & ~__GFP_DMA); + if (!skb) + goto out; + + /* Get the DATA. Size must match skb_add_mtu(). */ + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; +out: + return skb; +nodata: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +} + +/** + * alloc_skb_from_cache - allocate a network buffer + * @cp: kmem_cache from which to allocate the data area + * (object size must be big enough for @size bytes + skb overheads) + * @size: size to allocate + * @gfp_mask: allocation mask + * + * Allocate a new &sk_buff. The returned buffer has no headroom and + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. + */ +struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, + unsigned int size, int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, + gfp_mask & ~__GFP_DMA); + if (!skb) + goto out; + + /* Get the DATA. */ + size = SKB_DATA_ALIGN(size); + data = kmem_cache_alloc(cp, gfp_mask); + if (!data) + goto nodata; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; +out: + return skb; +nodata: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +} + + +static void skb_drop_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + skb_shinfo(skb)->frag_list = NULL; + + do { + struct sk_buff *this = list; + list = list->next; + kfree_skb(this); + } while (list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list; + + for (list = skb_shinfo(skb)->frag_list; list; list = list->next) + skb_get(list); +} + +void skb_release_data(struct sk_buff *skb) +{ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + } + + if (skb_shinfo(skb)->frag_list) + skb_drop_fraglist(skb); + + kfree(skb->head); + } +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +void kfree_skbmem(struct sk_buff *skb) +{ + skb_release_data(skb); + kmem_cache_free(skbuff_head_cache, skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ + if (skb->list) { + printk(KERN_WARNING "Warning: kfree_skb passed an skb still " + "on a list (from %p).\n", NET_CALLER(skb)); + BUG(); + } + + dst_release(skb->dst); +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif + if(skb->destructor) { + if (in_irq()) + printk(KERN_WARNING "Warning: kfree_skb on " + "hard IRQ %p\n", NET_CALLER(skb)); + skb->destructor(skb); + } +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(skb->nf_bridge); +#endif +#endif +/* XXX: IS this still necessary? - JHS */ +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; + skb->tc_classid = 0; +#endif +#endif + + kfree_skbmem(skb); +} + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +{ + struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + + if (!n) + return NULL; + +#define C(x) n->x = skb->x + + n->next = n->prev = NULL; + n->list = NULL; + n->sk = NULL; + C(stamp); + C(dev); + C(real_dev); + C(h); + C(nh); + C(mac); + C(dst); + dst_clone(skb->dst); + C(sp); +#ifdef CONFIG_INET + secpath_get(skb->sp); +#endif + memcpy(n->cb, skb->cb, sizeof(skb->cb)); + C(len); + C(data_len); + C(csum); + C(local_df); + n->cloned = 1; + n->nohdr = 0; + C(pkt_type); + C(ip_summed); + C(priority); + C(protocol); + C(security); + n->destructor = NULL; +#ifdef CONFIG_NETFILTER + C(nfmark); + C(nfcache); + C(nfct); + nf_conntrack_get(skb->nfct); + C(nfctinfo); +#ifdef CONFIG_NETFILTER_DEBUG + C(nf_debug); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + C(nf_bridge); + nf_bridge_get(skb->nf_bridge); +#endif +#endif /*CONFIG_NETFILTER*/ +#if defined(CONFIG_HIPPI) + C(private); +#endif +#ifdef CONFIG_NET_SCHED + C(tc_index); +#ifdef CONFIG_NET_CLS_ACT + n->tc_verd = SET_TC_VERD(skb->tc_verd,0); + n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); + n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); + C(input_dev); + C(tc_classid); +#endif + +#endif + C(truesize); + atomic_set(&n->users, 1); + C(head); + C(data); + C(tail); + C(end); + + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + + return n; +} + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + /* + * Shift between the two data areas in bytes + */ + unsigned long offset = new->data - old->data; + + new->list = NULL; + new->sk = NULL; + new->dev = old->dev; + new->real_dev = old->real_dev; + new->priority = old->priority; + new->protocol = old->protocol; + new->dst = dst_clone(old->dst); +#ifdef CONFIG_INET + new->sp = secpath_get(old->sp); +#endif + new->h.raw = old->h.raw + offset; + new->nh.raw = old->nh.raw + offset; + new->mac.raw = old->mac.raw + offset; + memcpy(new->cb, old->cb, sizeof(old->cb)); + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; + new->stamp = old->stamp; + new->destructor = NULL; + new->security = old->security; +#ifdef CONFIG_NETFILTER + new->nfmark = old->nfmark; + new->nfcache = old->nfcache; + new->nfct = old->nfct; + nf_conntrack_get(old->nfct); + new->nfctinfo = old->nfctinfo; +#ifdef CONFIG_NETFILTER_DEBUG + new->nf_debug = old->nf_debug; +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + new->nf_bridge = old->nf_bridge; + nf_bridge_get(old->nf_bridge); +#endif +#endif +#ifdef CONFIG_NET_SCHED +#ifdef CONFIG_NET_CLS_ACT + new->tc_verd = old->tc_verd; +#endif + new->tc_index = old->tc_index; +#endif + atomic_set(&new->users, 1); + skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; + skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; +} + +/** + * skb_copy - create private copy of an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * As by-product this function converts non-linear &sk_buff to linear + * one, so that &sk_buff becomes completely private and caller is allowed + * to modify all the data of returned buffer. This means that this + * function is not recommended for use in circumstances when only + * header is going to be modified. Use pskb_copy() instead. + */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) +{ + int headerlen = skb->data - skb->head; + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len, + gfp_mask); + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + n->csum = skb->csum; + n->ip_summed = skb->ip_summed; + + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) + BUG(); + + copy_skb_header(n, skb); + return n; +} + + +/** + * pskb_copy - create copy of an sk_buff with private head. + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and part of its data, located + * in header. Fragmented data remain shared. This is used when + * the caller wishes to modify only header of &sk_buff and needs + * private copy of the header to alter. Returns %NULL on failure + * or the pointer to the buffer on success. + * The returned buffer has a reference count of 1. + */ + +struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask); + + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, skb->data - skb->head); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + memcpy(n->data, skb->data, n->len); + n->csum = skb->csum; + n->ip_summed = skb->ip_summed; + + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_shinfo(skb)->frag_list) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + copy_skb_header(n, skb); +out: + return n; +} + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if &nhead and &ntail are zero) + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) +{ + int i; + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; + + if (skb_shared(skb)) + BUG(); + + size = SKB_DATA_ALIGN(size); + + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. */ + memcpy(data + nhead, skb->head, skb->tail - skb->head); + memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + + skb_release_data(skb); + + off = (data + nhead) - skb->head; + + skb->head = data; + skb->end = data + size; + skb->data += off; + skb->tail += off; + skb->mac.raw += off; + skb->h.raw += off; + skb->nh.raw += off; + skb->cloned = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; + +nodata: + return -ENOMEM; +} + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, + GFP_ATOMIC)) { + kfree_skb(skb2); + skb2 = NULL; + } + } + return skb2; +} + + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + * + * BUG ALERT: ip_summed is not copied. Why does this work? Is it used + * only by netfilter in the cases when checksum is recalculated? --ANK + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, int gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask); + int head_copy_len, head_copy_off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = skb_headroom(skb); + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. */ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) + BUG(); + + copy_skb_header(n, skb); + + return n; +} + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return NULL in out of memory cases. + */ + +struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +{ + struct sk_buff *nskb; + + /* If the skbuff is non linear tailroom is always zero.. */ + if (skb_tailroom(skb) >= pad) { + memset(skb->data+skb->len, 0, pad); + return skb; + } + + nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); + kfree_skb(skb); + if (nskb) + memset(nskb->data+nskb->len, 0, pad); + return nskb; +} + +/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. + * If realloc==0 and trimming is impossible without change of data, + * it is BUG(). + */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +{ + int offset = skb_headlen(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int i; + + for (i = 0; i < nfrags; i++) { + int end = offset + skb_shinfo(skb)->frags[i].size; + if (end > len) { + if (skb_cloned(skb)) { + if (!realloc) + BUG(); + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + } + if (len <= offset) { + put_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->nr_frags--; + } else { + skb_shinfo(skb)->frags[i].size = len - offset; + } + } + offset = end; + } + + if (offset < len) { + skb->data_len -= skb->len - len; + skb->len = len; + } else { + if (len <= skb_headlen(skb)) { + skb->len = len; + skb->data_len = 0; + skb->tail = skb->data + len; + if (skb_shinfo(skb)->frag_list && !skb_cloned(skb)) + skb_drop_fraglist(skb); + } else { + skb->data_len -= skb->len - len; + skb->len = len; + } + } + + return 0; +} + +/** + * __pskb_pull_tail - advance tail of skb header + * @skb: buffer to reallocate + * @delta: number of bytes to advance tail + * + * The function makes a sense only on a fragmented &sk_buff, + * it expands header moving its tail forward and copying necessary + * data from fragmented part. + * + * &sk_buff MUST have reference count of 1. + * + * Returns %NULL (and &sk_buff does not change) if pull failed + * or value of new tail of skb in the case of success. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ + /* If skb has not enough free space at tail, get new one + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ + int i, k, eat = (skb->tail + delta) - skb->end; + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, + GFP_ATOMIC)) + return NULL; + } + + if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta)) + BUG(); + + /* Optimization: no fragments, no reasons to preestimate + * size of pulled pages. Superb. + */ + if (!skb_shinfo(skb)->frag_list) + goto pull_pages; + + /* Estimate size of pulled pages. */ + eat = delta; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size >= eat) + goto pull_pages; + eat -= skb_shinfo(skb)->frags[i].size; + } + + /* If we need update frag list, we are in troubles. + * Certainly, it possible to add an offset to skb data, + * but taking into account that pulling is expected to + * be very rare operation, it is worth to fight against + * further bloating skb head and crucify ourselves here instead. + * Pure masohism, indeed. 8)8) + */ + if (eat) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) + BUG(); + + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + + if (skb_shared(list)) { + /* Sucks! We need to fork list. :-( */ + clone = skb_clone(list, GFP_ATOMIC); + if (!clone) + return NULL; + insp = list->next; + list = clone; + } else { + /* This may be pulled without + * problems. */ + insp = list; + } + if (!pskb_pull(list, eat)) { + if (clone) + kfree_skb(clone); + return NULL; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = skb_shinfo(skb)->frag_list) != insp) { + skb_shinfo(skb)->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + skb_shinfo(skb)->frag_list = clone; + } + } + /* Success! Now we may commit changes to skb data. */ + +pull_pages: + eat = delta; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail += delta; + skb->data_len -= delta; + + return skb->tail; +} + +/* Copy some data bits from skb to kernel buffer. */ + +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int i, copy; + int start = skb_headlen(skb); + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + memcpy(to, skb->data + offset, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(to, + vaddr + skb_shinfo(skb)->frags[i].page_offset+ + offset - start, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(list, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +/* Checksum skb data. */ + +unsigned int skb_checksum(const struct sk_buff *skb, int offset, + int len, unsigned int csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int pos = 0; + + /* Checksum header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial(skb->data + offset, copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + unsigned int csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial(vaddr + frag->page_offset + + offset - start, copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + unsigned int csum2; + if (copy > len) + copy = len; + csum2 = skb_checksum(list, offset - start, + copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + } + if (len) + BUG(); + + return csum; +} + +/* Both of above in one bottle. */ + +unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, + u8 *to, int len, unsigned int csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int pos = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial_copy_nocheck(skb->data + offset, to, + copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + unsigned int csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial_copy_nocheck(vaddr + + frag->page_offset + + offset - start, to, + copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + unsigned int csum2; + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(list, + offset - start, + to, copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + } + if (len) + BUG(); + return csum; +} + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ + unsigned int csum; + long csstart; + + if (skb->ip_summed == CHECKSUM_HW) + csstart = skb->h.raw - skb->data; + else + csstart = skb_headlen(skb); + + if (csstart > skb_headlen(skb)) + BUG(); + + memcpy(to, skb->data, csstart); + + csum = 0; + if (csstart != skb->len) + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, + skb->len - csstart, 0); + + if (skb->ip_summed == CHECKSUM_HW) { + long csstuff = csstart + skb->csum; + + *((unsigned short *)(to + csstuff)) = csum_fold(csum); + } +} + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls + * + * Works even without knowing the list it is sitting on, which can be + * handy at times. It also means that THE LIST MUST EXIST when you + * unlink. Thus a list must have its contents unlinked before it is + * destroyed. + */ +void skb_unlink(struct sk_buff *skb) +{ + struct sk_buff_head *list = skb->list; + + if (list) { + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + if (skb->list == list) + __skb_unlink(skb, skb->list); + spin_unlock_irqrestore(&list->lock, flags); + } +} + + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. + */ + +void skb_append(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_append(old, newsk); + spin_unlock_irqrestore(&old->list->lock, flags); +} + + +/** + * skb_insert - insert a buffer + * @old: buffer to insert before + * @newsk: buffer to insert + * + * Place a packet before a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls + * A buffer cannot be placed on two lists at the same time. + */ + +void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_insert(newsk, old->prev, old, old->list); + spin_unlock_irqrestore(&old->list->lock, flags); +} + +#if 0 +/* + * Tune the memory allocator for a new MTU size. + */ +void skb_add_mtu(int mtu) +{ + /* Must match allocation in alloc_skb */ + mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info); + + kmem_add_cache_size(mtu); +} +#endif + +static inline void skb_split_inside_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, const int pos) +{ + int i; + + memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len); + + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb->tail = skb->data + len; +} + +static inline void skb_split_no_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, int pos) +{ + int i, k = 0; + const int nfrags = skb_shinfo(skb)->nr_frags; + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i = 0; i < nfrags; i++) { + int size = skb_shinfo(skb)->frags[i].size; + + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_shinfo(skb1)->frags[0].size -= len - pos; + skb_shinfo(skb)->frags[i].size = len - pos; + skb_shinfo(skb)->nr_frags++; + } + k++; + } else + skb_shinfo(skb)->nr_frags++; + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. + * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ + int pos = skb_headlen(skb); + + if (len < pos) /* Split line is inside header. */ + skb_split_inside_header(skb, skb1, len, pos); + else /* Second chunk has no header, nothing to copy. */ + skb_split_no_header(skb, skb1, len, pos); +} + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!skbuff_head_cache) + panic("cannot create skbuff cache"); +} + +EXPORT_SYMBOL(___pskb_trim); +EXPORT_SYMBOL(__kfree_skb); +EXPORT_SYMBOL(__pskb_pull_tail); +EXPORT_SYMBOL(alloc_skb); +EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(pskb_expand_head); +EXPORT_SYMBOL(skb_checksum); +EXPORT_SYMBOL(skb_clone); +EXPORT_SYMBOL(skb_clone_fraglist); +EXPORT_SYMBOL(skb_copy); +EXPORT_SYMBOL(skb_copy_and_csum_bits); +EXPORT_SYMBOL(skb_copy_and_csum_dev); +EXPORT_SYMBOL(skb_copy_bits); +EXPORT_SYMBOL(skb_copy_expand); +EXPORT_SYMBOL(skb_over_panic); +EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(skb_realloc_headroom); +EXPORT_SYMBOL(skb_under_panic); +EXPORT_SYMBOL(skb_dequeue); +EXPORT_SYMBOL(skb_dequeue_tail); +EXPORT_SYMBOL(skb_insert); +EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_head); +EXPORT_SYMBOL(skb_queue_tail); +EXPORT_SYMBOL(skb_unlink); +EXPORT_SYMBOL(skb_append); +EXPORT_SYMBOL(skb_split); diff --git a/net/core/sock.c b/net/core/sock.c new file mode 100644 index 000000000000..629ab4a5b45b --- /dev/null +++ b/net/core/sock.c @@ -0,0 +1,1565 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * + * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Florian La Roche, <flla@stud.uni-sb.de> + * Alan Cox, <A.Cox@swansea.ac.uk> + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. + * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! + * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * Chris Evans : Security fixes - signedness again + * Arnaldo C. Melo : cleanups, use skb_queue_purge + * + * To Fix: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/poll.h> +#include <linux/tcp.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/netdevice.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/xfrm.h> +#include <linux/ipsec.h> + +#include <linux/filter.h> + +#ifdef CONFIG_INET +#include <net/tcp.h> +#endif + +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. + */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + +/* Run time adjustable parameters. */ +__u32 sysctl_wmem_max = SK_WMEM_MAX; +__u32 sysctl_rmem_max = SK_RMEM_MAX; +__u32 sysctl_wmem_default = SK_WMEM_MAX; +__u32 sysctl_rmem_default = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancilliary data plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +{ + struct timeval tv; + + if (optlen < sizeof(tv)) + return -EINVAL; + if (copy_from_user(&tv, optval, sizeof(tv))) + return -EFAULT; + + *timeo_p = MAX_SCHEDULE_TIMEOUT; + if (tv.tv_sec == 0 && tv.tv_usec == 0) + return 0; + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) + *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + return 0; +} + +static void sock_warn_obsolete_bsdism(const char *name) +{ + static int warned; + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +} + +static void sock_disable_timestamp(struct sock *sk) +{ + if (sock_flag(sk, SOCK_TIMESTAMP)) { + sock_reset_flag(sk, SOCK_TIMESTAMP); + net_disable_timestamp(); + } +} + + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. + */ + +int sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk=sock->sk; + struct sk_filter *filter; + int val; + int valbool; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + +#ifdef SO_DONTLINGER /* Compatibility item... */ + switch (optname) { + case SO_DONTLINGER: + sock_reset_flag(sk, SOCK_LINGER); + return 0; + } +#endif + + if(optlen<sizeof(int)) + return(-EINVAL); + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val?1:0; + + lock_sock(sk); + + switch(optname) + { + case SO_DEBUG: + if(val && !capable(CAP_NET_ADMIN)) + { + ret = -EACCES; + } + else if (valbool) + sock_set_flag(sk, SOCK_DBG); + else + sock_reset_flag(sk, SOCK_DBG); + break; + case SO_REUSEADDR: + sk->sk_reuse = valbool; + break; + case SO_TYPE: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + if (valbool) + sock_set_flag(sk, SOCK_LOCALROUTE); + else + sock_reset_flag(sk, SOCK_LOCALROUTE); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; + + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + if ((val * 2) < SOCK_MIN_SNDBUF) + sk->sk_sndbuf = SOCK_MIN_SNDBUF; + else + sk->sk_sndbuf = val * 2; + + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->sk_write_space(sk); + break; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; + + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + /* FIXME: is this lower bound the right one? */ + if ((val * 2) < SOCK_MIN_RCVBUF) + sk->sk_rcvbuf = SOCK_MIN_RCVBUF; + else + sk->sk_rcvbuf = val * 2; + break; + + case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->sk_protocol == IPPROTO_TCP) + tcp_set_keepalive(sk, valbool); +#endif + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->sk_priority = val; + else + ret = -EPERM; + break; + + case SO_LINGER: + if(optlen<sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ + break; + } + if (copy_from_user(&ling,optval,sizeof(ling))) { + ret = -EFAULT; + break; + } + if (!ling.l_onoff) + sock_reset_flag(sk, SOCK_LINGER); + else { +#if (BITS_PER_LONG == 32) + if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + else +#endif + sk->sk_lingertime = ling.l_linger * HZ; + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("setsockopt"); + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP: + if (valbool) { + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk); + } else + sock_reset_flag(sk, SOCK_RCVTSTAMP); + break; + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; + + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + break; + + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + break; + +#ifdef CONFIG_NETDEVICES + case SO_BINDTODEVICE: + { + char devname[IFNAMSIZ]; + + /* Sorry... */ + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + break; + } + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + + if (!valbool) { + sk->sk_bound_dev_if = 0; + } else { + if (optlen > IFNAMSIZ) + optlen = IFNAMSIZ; + if (copy_from_user(devname, optval, optlen)) { + ret = -EFAULT; + break; + } + + /* Remove any cached route for this socket. */ + sk_dst_reset(sk); + + if (devname[0] == '\0') { + sk->sk_bound_dev_if = 0; + } else { + struct net_device *dev = dev_get_by_name(devname); + if (!dev) { + ret = -ENODEV; + break; + } + sk->sk_bound_dev_if = dev->ifindex; + dev_put(dev); + } + } + break; + } +#endif + + + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + spin_lock_bh(&sk->sk_lock.slock); + filter = sk->sk_filter; + if (filter) { + sk->sk_filter = NULL; + spin_unlock_bh(&sk->sk_lock.slock); + sk_filter_release(sk, filter); + break; + } + spin_unlock_bh(&sk->sk_lock.slock); + ret = -ENONET; + break; + + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + default: + ret = -ENOPROTOOPT; + break; + } + release_sock(sk); + return ret; +} + + +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + union + { + int val; + struct linger ling; + struct timeval tm; + } v; + + unsigned int lv = sizeof(int); + int len; + + if(get_user(len,optlen)) + return -EFAULT; + if(len < 0) + return -EINVAL; + + switch(optname) + { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = !!sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = sk->sk_sndbuf; + break; + + case SO_RCVBUF: + v.val = sk->sk_rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_KEEPALIVE: + v.val = !!sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if(v.val==0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = !!sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check; + break; + + case SO_PRIORITY: + v.val = sk->sk_priority; + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = sk->sk_lingertime / HZ; + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("getsockopt"); + break; + + case SO_TIMESTAMP: + v.val = sock_flag(sk, SOCK_RCVTSTAMP); + break; + + case SO_RCVTIMEO: + lv=sizeof(struct timeval); + if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_rcvtimeo / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_SNDTIMEO: + lv=sizeof(struct timeval); + if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_sndtimeo / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_RCVLOWAT: + v.val = sk->sk_rcvlowat; + break; + + case SO_SNDLOWAT: + v.val=1; + break; + + case SO_PASSCRED: + v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; + break; + + case SO_PEERCRED: + if (len > sizeof(sk->sk_peercred)) + len = sizeof(sk->sk_peercred); + if (copy_to_user(optval, &sk->sk_peercred, len)) + return -EFAULT; + goto lenout; + + case SO_PEERNAME: + { + char address[128]; + + if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } + + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; + + case SO_PEERSEC: + return security_socket_getpeersec(sock, optval, optlen, len); + + default: + return(-ENOPROTOOPT); + } + if (len > lv) + len = lv; + if (copy_to_user(optval, &v, len)) + return -EFAULT; +lenout: + if (put_user(len, optlen)) + return -EFAULT; + return 0; +} + +/** + * sk_alloc - All socket objects are allocated here + * @family - protocol family + * @priority - for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot - struct proto associated with this new sock instance + * @zero_it - if we should zero the newly allocated sock + */ +struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) +{ + struct sock *sk = NULL; + kmem_cache_t *slab = prot->slab; + + if (slab != NULL) + sk = kmem_cache_alloc(slab, priority); + else + sk = kmalloc(prot->obj_size, priority); + + if (sk) { + if (zero_it) { + memset(sk, 0, prot->obj_size); + sk->sk_family = family; + sk->sk_prot = prot; + sock_lock_init(sk); + } + + if (security_sk_alloc(sk, family, priority)) { + kmem_cache_free(slab, sk); + sk = NULL; + } else + __module_get(prot->owner); + } + return sk; +} + +void sk_free(struct sock *sk) +{ + struct sk_filter *filter; + struct module *owner = sk->sk_prot->owner; + + if (sk->sk_destruct) + sk->sk_destruct(sk); + + filter = sk->sk_filter; + if (filter) { + sk_filter_release(sk, filter); + sk->sk_filter = NULL; + } + + sock_disable_timestamp(sk); + + if (atomic_read(&sk->sk_omem_alloc)) + printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", + __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); + + security_sk_free(sk); + if (sk->sk_prot->slab != NULL) + kmem_cache_free(sk->sk_prot->slab, sk); + else + kfree(sk); + module_put(owner); +} + +void __init sk_init(void) +{ + if (num_physpages <= 4096) { + sysctl_wmem_max = 32767; + sysctl_rmem_max = 32767; + sysctl_wmem_default = 32767; + sysctl_rmem_default = 32767; + } else if (num_physpages >= 131072) { + sysctl_wmem_max = 131071; + sysctl_rmem_max = 131071; + } +} + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + /* In case it might be waiting for more memory. */ + atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) + sk->sk_write_space(sk); + sock_put(sk); +} + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + + +int sock_i_uid(struct sock *sk) +{ + int uid; + + read_lock(&sk->sk_callback_lock); + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; + read_unlock(&sk->sk_callback_lock); + return uid; +} + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + read_lock(&sk->sk_callback_lock); + ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; + read_unlock(&sk->sk_callback_lock); + return ino; +} + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + struct sk_buff * skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_w(skb, sk); + return skb; + } + } + return NULL; +} + +/* + * Allocate a skb from the socket's receive buffer. + */ +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_r(skb, sk); + return skb; + } + } + return NULL; +} + +/* + * Allocate a memory block from the socket's option memory buffer. + */ +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ + if ((unsigned)size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->sk_omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->sk_omem_alloc); + } + return NULL; +} + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + kfree(mem); + atomic_sub(size, &sk->sk_omem_alloc); +} + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static long sock_wait_for_wmem(struct sock * sk, long timeo) +{ + DEFINE_WAIT(wait); + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + for (;;) { + if (!timeo) + break; + if (signal_pending(current)) + break; + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + break; + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + timeo = schedule_timeout(timeo); + } + finish_wait(sk->sk_sleep, &wait); + return timeo; +} + + +/* + * Generic send/receive buffer handlers + */ + +static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, + unsigned long header_len, + unsigned long data_len, + int noblock, int *errcode) +{ + struct sk_buff *skb; + unsigned int gfp_mask; + long timeo; + int err; + + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + timeo = sock_sndtimeo(sk, noblock); + while (1) { + err = sock_error(sk); + if (err != 0) + goto failure; + + err = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(header_len, sk->sk_allocation); + if (skb) { + int npages; + int i; + + /* No pages, we're done... */ + if (!data_len) + break; + + npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + skb->truesize += data_len; + skb_shinfo(skb)->nr_frags = npages; + for (i = 0; i < npages; i++) { + struct page *page; + skb_frag_t *frag; + + page = alloc_pages(sk->sk_allocation, 0); + if (!page) { + err = -ENOBUFS; + skb_shinfo(skb)->nr_frags = i; + kfree_skb(skb); + goto failure; + } + + frag = &skb_shinfo(skb)->frags[i]; + frag->page = page; + frag->page_offset = 0; + frag->size = (data_len >= PAGE_SIZE ? + PAGE_SIZE : + data_len); + data_len -= PAGE_SIZE; + } + + /* Full success... */ + break; + } + err = -ENOBUFS; + goto failure; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + } + + skb_set_owner_w(skb, sk); + return skb; + +interrupted: + err = sock_intr_errno(timeo); +failure: + *errcode = err; + return NULL; +} + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) +{ + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); +} + +static void __lock_sock(struct sock *sk) +{ + DEFINE_WAIT(wait); + + for(;;) { + prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_bh(&sk->sk_lock.slock); + schedule(); + spin_lock_bh(&sk->sk_lock.slock); + if(!sock_owned_by_user(sk)) + break; + } + finish_wait(&sk->sk_lock.wq, &wait); +} + +static void __release_sock(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + do { + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + bh_unlock_sock(sk); + + do { + struct sk_buff *next = skb->next; + + skb->next = NULL; + sk->sk_backlog_rcv(sk, skb); + + /* + * We are in process context here with softirqs + * disabled, use cond_resched_softirq() to preempt. + * This is safe to do because we've taken the backlog + * queue private: + */ + cond_resched_softirq(); + + skb = next; + } while (skb != NULL); + + bh_lock_sock(sk); + } while((skb = sk->sk_backlog.head) != NULL); +} + +/** + * sk_wait_data - wait for data to arrive at sk_receive_queue + * sk - sock to wait on + * timeo - for how long + * + * Now socket state including sk->sk_err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ +int sk_wait_data(struct sock *sk, long *timeo) +{ + int rc; + DEFINE_WAIT(wait); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk->sk_sleep, &wait); + return rc; +} + +EXPORT_SYMBOL(sk_wait_data); + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. + */ + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + return -EOPNOTSUPP; +} + +unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +{ + return 0; +} + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} + +int sock_no_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + return -EOPNOTSUPP; +} + +int sock_no_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EOPNOTSUPP; +} + +int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len) +{ + return -EOPNOTSUPP; +} + +int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + /* Mirror missing mmap method error code */ + return -ENODEV; +} + +ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg(sock, &msg, &iov, 1, size); + kunmap(page); + return res; +} + +/* + * Default Socket Callbacks + */ + +static void sock_def_wakeup(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_all(sk->sk_sleep); + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_error_report(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk,0,POLL_ERR); + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_readable(struct sock *sk, int len) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk,1,POLL_IN); + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, 2, POLL_OUT); + } + + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_destruct(struct sock *sk) +{ + if (sk->sk_protinfo) + kfree(sk->sk_protinfo); +} + +void sk_send_sigurg(struct sock *sk) +{ + if (sk->sk_socket && sk->sk_socket->file) + if (send_sigurg(&sk->sk_socket->file->f_owner)) + sk_wake_async(sk, 3, POLL_PRI); +} + +void sk_reset_timer(struct sock *sk, struct timer_list* timer, + unsigned long expires) +{ + if (!mod_timer(timer, expires)) + sock_hold(sk); +} + +EXPORT_SYMBOL(sk_reset_timer); + +void sk_stop_timer(struct sock *sk, struct timer_list* timer) +{ + if (timer_pending(timer) && del_timer(timer)) + __sock_put(sk); +} + +EXPORT_SYMBOL(sk_stop_timer); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + sk->sk_send_head = NULL; + + init_timer(&sk->sk_timer); + + sk->sk_allocation = GFP_KERNEL; + sk->sk_rcvbuf = sysctl_rmem_default; + sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_state = TCP_CLOSE; + sk->sk_socket = sock; + + sock_set_flag(sk, SOCK_ZAPPED); + + if(sock) + { + sk->sk_type = sock->type; + sk->sk_sleep = &sock->wait; + sock->sk = sk; + } else + sk->sk_sleep = NULL; + + rwlock_init(&sk->sk_dst_lock); + rwlock_init(&sk->sk_callback_lock); + + sk->sk_state_change = sock_def_wakeup; + sk->sk_data_ready = sock_def_readable; + sk->sk_write_space = sock_def_write_space; + sk->sk_error_report = sock_def_error_report; + sk->sk_destruct = sock_def_destruct; + + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + + sk->sk_peercred.pid = 0; + sk->sk_peercred.uid = -1; + sk->sk_peercred.gid = -1; + sk->sk_write_pending = 0; + sk->sk_rcvlowat = 1; + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + + sk->sk_stamp.tv_sec = -1L; + sk->sk_stamp.tv_usec = -1L; + + atomic_set(&sk->sk_refcnt, 1); +} + +void fastcall lock_sock(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&(sk->sk_lock.slock)); + if (sk->sk_lock.owner) + __lock_sock(sk); + sk->sk_lock.owner = (void *)1; + spin_unlock_bh(&(sk->sk_lock.slock)); +} + +EXPORT_SYMBOL(lock_sock); + +void fastcall release_sock(struct sock *sk) +{ + spin_lock_bh(&(sk->sk_lock.slock)); + if (sk->sk_backlog.tail) + __release_sock(sk); + sk->sk_lock.owner = NULL; + if (waitqueue_active(&(sk->sk_lock.wq))) + wake_up(&(sk->sk_lock.wq)); + spin_unlock_bh(&(sk->sk_lock.slock)); +} +EXPORT_SYMBOL(release_sock); + +int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +{ + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk); + if (sk->sk_stamp.tv_sec == -1) + return -ENOENT; + if (sk->sk_stamp.tv_sec == 0) + do_gettimeofday(&sk->sk_stamp); + return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ? + -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestamp); + +void sock_enable_timestamp(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_TIMESTAMP)) { + sock_set_flag(sk, SOCK_TIMESTAMP); + net_enable_timestamp(); + } +} +EXPORT_SYMBOL(sock_enable_timestamp); + +/* + * Get a socket option on an socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). + */ +int sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL(sock_common_getsockopt); + +int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + +EXPORT_SYMBOL(sock_common_recvmsg); + +/* + * Set socket options on an inet socket. + */ +int sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL(sock_common_setsockopt); + +void sk_common_release(struct sock *sk) +{ + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + + /* + * Observation: when sock_common_release is called, processes have + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. + */ + + sk->sk_prot->unhash(sk); + + /* + * In this point socket cannot receive new packets, but it is possible + * that some packets are in flight because some CPU runs receiver and + * did hash table lookup before we unhashed socket. They will achieve + * receive queue and will be purged by socket destructor. + * + * Also we still have packets pending on receive queue and probably, + * our own packets waiting in device queues. sock_destroy will drain + * receive queue, but transmitted packets will delay socket destruction + * until the last reference will be released. + */ + + sock_orphan(sk); + + xfrm_sk_free_policy(sk); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", + sk, atomic_read(&sk->sk_refcnt)); +#endif + sock_put(sk); +} + +EXPORT_SYMBOL(sk_common_release); + +static DEFINE_RWLOCK(proto_list_lock); +static LIST_HEAD(proto_list); + +int proto_register(struct proto *prot, int alloc_slab) +{ + int rc = -ENOBUFS; + + write_lock(&proto_list_lock); + + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", + prot->name); + goto out_unlock; + } + } + + list_add(&prot->node, &proto_list); + rc = 0; +out_unlock: + write_unlock(&proto_list_lock); + return rc; +} + +EXPORT_SYMBOL(proto_register); + +void proto_unregister(struct proto *prot) +{ + write_lock(&proto_list_lock); + + if (prot->slab != NULL) { + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } + + list_del(&prot->node); + write_unlock(&proto_list_lock); +} + +EXPORT_SYMBOL(proto_unregister); + +#ifdef CONFIG_PROC_FS +static inline struct proto *__proto_head(void) +{ + return list_entry(proto_list.next, struct proto, node); +} + +static inline struct proto *proto_head(void) +{ + return list_empty(&proto_list) ? NULL : __proto_head(); +} + +static inline struct proto *proto_next(struct proto *proto) +{ + return proto->node.next == &proto_list ? NULL : + list_entry(proto->node.next, struct proto, node); +} + +static inline struct proto *proto_get_idx(loff_t pos) +{ + struct proto *proto; + loff_t i = 0; + + list_for_each_entry(proto, &proto_list, node) + if (i++ == pos) + goto out; + + proto = NULL; +out: + return proto; +} + +static void *proto_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&proto_list_lock); + return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? proto_head() : proto_next(v); +} + +static void proto_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&proto_list_lock); +} + +static char proto_method_implemented(const void *method) +{ + return method == NULL ? 'n' : 'y'; +} + +static void proto_seq_printf(struct seq_file *seq, struct proto *proto) +{ + seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + proto->name, + proto->obj_size, + proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1, + proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + proto->max_header, + proto->slab == NULL ? "no" : "yes", + module_name(proto->owner), + proto_method_implemented(proto->close), + proto_method_implemented(proto->connect), + proto_method_implemented(proto->disconnect), + proto_method_implemented(proto->accept), + proto_method_implemented(proto->ioctl), + proto_method_implemented(proto->init), + proto_method_implemented(proto->destroy), + proto_method_implemented(proto->shutdown), + proto_method_implemented(proto->setsockopt), + proto_method_implemented(proto->getsockopt), + proto_method_implemented(proto->sendmsg), + proto_method_implemented(proto->recvmsg), + proto_method_implemented(proto->sendpage), + proto_method_implemented(proto->bind), + proto_method_implemented(proto->backlog_rcv), + proto_method_implemented(proto->hash), + proto_method_implemented(proto->unhash), + proto_method_implemented(proto->get_port), + proto_method_implemented(proto->enter_memory_pressure)); +} + +static int proto_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", + "protocol", + "size", + "sockets", + "memory", + "press", + "maxhdr", + "slab", + "module", + "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + else + proto_seq_printf(seq, v); + return 0; +} + +static struct seq_operations proto_seq_ops = { + .start = proto_seq_start, + .next = proto_seq_next, + .stop = proto_seq_stop, + .show = proto_seq_show, +}; + +static int proto_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &proto_seq_ops); +} + +static struct file_operations proto_seq_fops = { + .owner = THIS_MODULE, + .open = proto_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proto_init(void) +{ + /* register /proc/net/protocols */ + return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0; +} + +subsys_initcall(proto_init); + +#endif /* PROC_FS */ + +EXPORT_SYMBOL(sk_alloc); +EXPORT_SYMBOL(sk_free); +EXPORT_SYMBOL(sk_send_sigurg); +EXPORT_SYMBOL(sock_alloc_send_skb); +EXPORT_SYMBOL(sock_init_data); +EXPORT_SYMBOL(sock_kfree_s); +EXPORT_SYMBOL(sock_kmalloc); +EXPORT_SYMBOL(sock_no_accept); +EXPORT_SYMBOL(sock_no_bind); +EXPORT_SYMBOL(sock_no_connect); +EXPORT_SYMBOL(sock_no_getname); +EXPORT_SYMBOL(sock_no_getsockopt); +EXPORT_SYMBOL(sock_no_ioctl); +EXPORT_SYMBOL(sock_no_listen); +EXPORT_SYMBOL(sock_no_mmap); +EXPORT_SYMBOL(sock_no_poll); +EXPORT_SYMBOL(sock_no_recvmsg); +EXPORT_SYMBOL(sock_no_sendmsg); +EXPORT_SYMBOL(sock_no_sendpage); +EXPORT_SYMBOL(sock_no_setsockopt); +EXPORT_SYMBOL(sock_no_shutdown); +EXPORT_SYMBOL(sock_no_socketpair); +EXPORT_SYMBOL(sock_rfree); +EXPORT_SYMBOL(sock_setsockopt); +EXPORT_SYMBOL(sock_wfree); +EXPORT_SYMBOL(sock_wmalloc); +EXPORT_SYMBOL(sock_i_uid); +EXPORT_SYMBOL(sock_i_ino); +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_optmem_max); +EXPORT_SYMBOL(sysctl_rmem_max); +EXPORT_SYMBOL(sysctl_wmem_max); +#endif diff --git a/net/core/stream.c b/net/core/stream.c new file mode 100644 index 000000000000..1e27a57b5a97 --- /dev/null +++ b/net/core/stream.c @@ -0,0 +1,287 @@ +/* + * SUCS NET3: + * + * Generic stream handling routines. These are generic for most + * protocols. Even IP. Tonight 8-). + * This is used because TCP, LLC (others too) layer all have mostly + * identical sendmsg() and recvmsg() code. + * So we (will) share it here. + * + * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * (from old tcp.c code) + * Alan Cox <alan@redhat.com> (Borrowed comments 8-)) + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/signal.h> +#include <linux/tcp.h> +#include <linux/wait.h> +#include <net/sock.h> + +/** + * sk_stream_write_space - stream socket write_space callback. + * sk - socket + * + * FIXME: write proper description + */ +void sk_stream_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, 2, POLL_OUT); + } +} + +EXPORT_SYMBOL(sk_stream_write_space); + +/** + * sk_stream_wait_connect - Wait for a socket to get into the connected state + * @sk - sock to wait on + * @timeo_p - for how long to wait + * + * Must be called with the socket locked. + */ +int sk_stream_wait_connect(struct sock *sk, long *timeo_p) +{ + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + + while (1) { + if (sk->sk_err) + return sock_error(sk); + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) + return -EPIPE; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(tsk)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + if (sk_wait_event(sk, timeo_p, + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) + break; + finish_wait(sk->sk_sleep, &wait); + sk->sk_write_pending--; + } + return 0; +} + +EXPORT_SYMBOL(sk_stream_wait_connect); + +/** + * sk_stream_closing - Return 1 if we still have things to send in our buffers. + * @sk - socket to verify + */ +static inline int sk_stream_closing(struct sock *sk) +{ + return (1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +} + +void sk_stream_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk->sk_sleep, &wait); + } +} + +EXPORT_SYMBOL(sk_stream_wait_close); + +/** + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk - socket to wait for memory + * @timeo_p - for how long + */ +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + int err = 0; + long vm_wait = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + if (sk_stream_memory_free(sk)) + current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + + while (1) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo_p) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sk_stream_memory_free(sk) && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && + vm_wait); + sk->sk_write_pending--; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo_p = current_timeo; + } +out: + finish_wait(sk->sk_sleep, &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_nonblock: + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; +} + +EXPORT_SYMBOL(sk_stream_wait_memory); + +void sk_stream_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk->sk_forward_alloc += skb->truesize; +} + +EXPORT_SYMBOL(sk_stream_rfree); + +int sk_stream_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + +EXPORT_SYMBOL(sk_stream_error); + +void __sk_stream_mem_reclaim(struct sock *sk) +{ + if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) { + atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM, + sk->sk_prot->memory_allocated); + sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1; + if (*sk->sk_prot->memory_pressure && + (atomic_read(sk->sk_prot->memory_allocated) < + sk->sk_prot->sysctl_mem[0])) + *sk->sk_prot->memory_pressure = 0; + } +} + +EXPORT_SYMBOL(__sk_stream_mem_reclaim); + +int sk_stream_mem_schedule(struct sock *sk, int size, int kind) +{ + int amt = sk_stream_pages(size); + + sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM; + atomic_add(amt, sk->sk_prot->memory_allocated); + + /* Under limit. */ + if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) { + if (*sk->sk_prot->memory_pressure) + *sk->sk_prot->memory_pressure = 0; + return 1; + } + + /* Over hard limit. */ + if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) { + sk->sk_prot->enter_memory_pressure(); + goto suppress_allocation; + } + + /* Under pressure. */ + if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1]) + sk->sk_prot->enter_memory_pressure(); + + if (kind) { + if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0]) + return 1; + } else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0]) + return 1; + + if (!*sk->sk_prot->memory_pressure || + sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) * + sk_stream_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + +suppress_allocation: + + if (!kind) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + return 1; + } + + /* Alas. Undo changes. */ + sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM; + atomic_sub(amt, sk->sk_prot->memory_allocated); + return 0; +} + +EXPORT_SYMBOL(sk_stream_mem_schedule); + +void sk_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + BUG_TRAP(skb_queue_empty(&sk->sk_write_queue)); + + /* Account for returned memory. */ + sk_stream_mem_reclaim(sk); + + BUG_TRAP(!sk->sk_wmem_queued); + BUG_TRAP(!sk->sk_forward_alloc); + + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ +} + +EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 000000000000..c8be646cb191 --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,182 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/config.h> +#include <linux/module.h> + +#ifdef CONFIG_SYSCTL + +extern int netdev_max_backlog; +extern int weight_p; +extern int no_cong_thresh; +extern int no_cong; +extern int lo_cong; +extern int mod_cong; +extern int netdev_fastroute; +extern int net_msg_cost; +extern int net_msg_burst; + +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; + +extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; +extern int sysctl_somaxconn; + +#ifdef CONFIG_NET_DIVERT +extern char sysctl_divert_version[]; +#endif /* CONFIG_NET_DIVERT */ + +/* + * This strdup() is used for creating copies of network + * device names to be handed over to sysctl. + */ + +char *net_sysctl_strdup(const char *s) +{ + char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); + if (rv) + strcpy(rv, s); + return rv; +} + +ctl_table core_table[] = { +#ifdef CONFIG_NET + { + .ctl_name = NET_CORE_WMEM_MAX, + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_RMEM_MAX, + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_WMEM_DEFAULT, + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_RMEM_DEFAULT, + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_DEV_WEIGHT, + .procname = "dev_weight", + .data = &weight_p, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_MAX_BACKLOG, + .procname = "netdev_max_backlog", + .data = &netdev_max_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_NO_CONG_THRESH, + .procname = "no_cong_thresh", + .data = &no_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_NO_CONG, + .procname = "no_cong", + .data = &no_cong, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_LO_CONG, + .procname = "lo_cong", + .data = &lo_cong, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_MOD_CONG, + .procname = "mod_cong", + .data = &mod_cong, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_MSG_COST, + .procname = "message_cost", + .data = &net_msg_cost, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_CORE_MSG_BURST, + .procname = "message_burst", + .data = &net_msg_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_CORE_OPTMEM_MAX, + .procname = "optmem_max", + .data = &sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#ifdef CONFIG_NET_DIVERT + { + .ctl_name = NET_CORE_DIVERT_VERSION, + .procname = "divert_version", + .data = (void *)sysctl_divert_version, + .maxlen = 32, + .mode = 0444, + .proc_handler = &proc_dostring + }, +#endif /* CONFIG_NET_DIVERT */ +#endif /* CONFIG_NET */ + { + .ctl_name = NET_CORE_SOMAXCONN, + .procname = "somaxconn", + .data = &sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +EXPORT_SYMBOL(net_sysctl_strdup); + +#endif diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 000000000000..e11a8654f363 --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,155 @@ +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andy Kleen + * + * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <linux/init.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + + +/* + This is a maximally equidistributed combined Tausworthe generator + based on code from GNU Scientific Library 1.5 (30 Jun 2004) + + x_n = (s1_n ^ s2_n ^ s3_n) + + s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) + s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) + s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + + The period of this generator is about 2^88. + + From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + Generators", Mathematics of Computation, 65, 213 (1996), 203--213. + + This is available on the net from L'Ecuyer's home page, + + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps + ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps + + There is an erratum in the paper "Tables of Maximally + Equidistributed Combined LFSR Generators", Mathematics of + Computation, 68, 225 (1999), 261--269: + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + + ... the k_j most significant bits of z_j must be non- + zero, for each j. (Note: this restriction also applies to the + computer code given in [4], but was mistakenly not mentioned in + that paper.) + + This affects the seeding procedure by imposing the requirement + s1 > 1, s2 > 7, s3 > 15. + +*/ +struct nrnd_state { + u32 s1, s2, s3; +}; + +static DEFINE_PER_CPU(struct nrnd_state, net_rand_state); + +static u32 __net_random(struct nrnd_state *state) +{ +#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b) + + state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12); + state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4); + state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17); + + return (state->s1 ^ state->s2 ^ state->s3); +} + +static void __net_srandom(struct nrnd_state *state, unsigned long s) +{ + if (s == 0) + s = 1; /* default seed is 1 */ + +#define LCG(n) (69069 * n) + state->s1 = LCG(s); + state->s2 = LCG(state->s1); + state->s3 = LCG(state->s2); + + /* "warm it up" */ + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); +} + + +unsigned long net_random(void) +{ + unsigned long r; + struct nrnd_state *state = &get_cpu_var(net_rand_state); + r = __net_random(state); + put_cpu_var(state); + return r; +} + + +void net_srandom(unsigned long entropy) +{ + struct nrnd_state *state = &get_cpu_var(net_rand_state); + __net_srandom(state, state->s1^entropy); + put_cpu_var(state); +} + +void __init net_random_init(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + struct nrnd_state *state = &per_cpu(net_rand_state,i); + __net_srandom(state, i+jiffies); + } +} + +static int net_random_reseed(void) +{ + int i; + unsigned long seed[NR_CPUS]; + + get_random_bytes(seed, sizeof(seed)); + for (i = 0; i < NR_CPUS; i++) { + struct nrnd_state *state = &per_cpu(net_rand_state,i); + __net_srandom(state, seed[i]); + } + return 0; +} +late_initcall(net_random_reseed); + +int net_msg_cost = 5*HZ; +int net_msg_burst = 10; + +/* + * All net warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + return __printk_ratelimit(net_msg_cost, net_msg_burst); +} + +EXPORT_SYMBOL(net_random); +EXPORT_SYMBOL(net_ratelimit); +EXPORT_SYMBOL(net_srandom); diff --git a/net/core/wireless.c b/net/core/wireless.c new file mode 100644 index 000000000000..750cc5daeb03 --- /dev/null +++ b/net/core/wireless.c @@ -0,0 +1,1459 @@ +/* + * This file implement the Wireless Extensions APIs. + * + * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> + * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved. + * + * (As all part of the Linux kernel, this file is GPL) + */ + +/************************** DOCUMENTATION **************************/ +/* + * API definition : + * -------------- + * See <linux/wireless.h> for details of the APIs and the rest. + * + * History : + * ------- + * + * v1 - 5.12.01 - Jean II + * o Created this file. + * + * v2 - 13.12.01 - Jean II + * o Move /proc/net/wireless stuff from net/core/dev.c to here + * o Make Wireless Extension IOCTLs go through here + * o Added iw_handler handling ;-) + * o Added standard ioctl description + * o Initial dumb commit strategy based on orinoco.c + * + * v3 - 19.12.01 - Jean II + * o Make sure we don't go out of standard_ioctl[] in ioctl_standard_call + * o Add event dispatcher function + * o Add event description + * o Propagate events as rtnetlink IFLA_WIRELESS option + * o Generate event on selected SET requests + * + * v4 - 18.04.02 - Jean II + * o Fix stupid off by one in iw_ioctl_description : IW_ESSID_MAX_SIZE + 1 + * + * v5 - 21.06.02 - Jean II + * o Add IW_PRIV_TYPE_ADDR in priv_type_size (+cleanup) + * o Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes + * o Add IWEVCUSTOM for driver specific event/scanning token + * o Turn on WE_STRICT_WRITE by default + kernel warning + * o Fix WE_STRICT_WRITE in ioctl_export_private() (32 => iw_num) + * o Fix off-by-one in test (extra_size <= IFNAMSIZ) + * + * v6 - 9.01.03 - Jean II + * o Add common spy support : iw_handler_set_spy(), wireless_spy_update() + * o Add enhanced spy support : iw_handler_set_thrspy() and event. + * o Add WIRELESS_EXT version display in /proc/net/wireless + * + * v6 - 18.06.04 - Jean II + * o Change get_spydata() method for added safety + * o Remove spy #ifdef, they are always on -> cleaner code + * o Allow any size GET request if user specifies length > max + * and if request has IW_DESCR_FLAG_NOMAX flag or is SIOCGIWPRIV + * o Start migrating get_wireless_stats to struct iw_handler_def + * o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus + * Based on patch from Pavel Roskin <proski@gnu.org> : + * o Fix kernel data leak to user space in private handler handling + */ + +/***************************** INCLUDES *****************************/ + +#include <linux/config.h> /* Not needed ??? */ +#include <linux/module.h> +#include <linux/types.h> /* off_t */ +#include <linux/netdevice.h> /* struct ifreq, dev_get_by_name() */ +#include <linux/proc_fs.h> +#include <linux/rtnetlink.h> /* rtnetlink stuff */ +#include <linux/seq_file.h> +#include <linux/init.h> /* for __init */ +#include <linux/if_arp.h> /* ARPHRD_ETHER */ + +#include <linux/wireless.h> /* Pretty obvious */ +#include <net/iw_handler.h> /* New driver API */ + +#include <asm/uaccess.h> /* copy_to_user() */ + +/**************************** CONSTANTS ****************************/ + +/* Debugging stuff */ +#undef WE_IOCTL_DEBUG /* Debug IOCTL API */ +#undef WE_EVENT_DEBUG /* Debug Event dispatcher */ +#undef WE_SPY_DEBUG /* Debug enhanced spy support */ + +/* Options */ +#define WE_EVENT_NETLINK /* Propagate events using rtnetlink */ +#define WE_SET_EVENT /* Generate an event on some set commands */ + +/************************* GLOBAL VARIABLES *************************/ +/* + * You should not use global variables, because of re-entrancy. + * On our case, it's only const, so it's OK... + */ +/* + * Meta-data about all the standard Wireless Extension request we + * know about. + */ +static const struct iw_ioctl_description standard_ioctl[] = { + [SIOCSIWCOMMIT - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWNAME - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_CHAR, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWNWID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWNWID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWFREQ - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_FREQ, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWFREQ - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_FREQ, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWMODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_UINT, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWMODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_UINT, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWSENS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWSENS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWRANGE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWRANGE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = sizeof(struct iw_range), + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWPRIV - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWPRIV - SIOCIWFIRST] = { /* (handled directly by us) */ + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCSIWSTATS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWSTATS - SIOCIWFIRST] = { /* (handled directly by us) */ + .header_type = IW_HEADER_TYPE_NULL, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr), + .max_tokens = IW_MAX_SPY, + }, + [SIOCGIWSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr) + + sizeof(struct iw_quality), + .max_tokens = IW_MAX_SPY, + }, + [SIOCSIWTHRSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_thrspy), + .min_tokens = 1, + .max_tokens = 1, + }, + [SIOCGIWTHRSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_thrspy), + .min_tokens = 1, + .max_tokens = 1, + }, + [SIOCSIWAP - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [SIOCGIWAP - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCGIWAPLIST - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr) + + sizeof(struct iw_quality), + .max_tokens = IW_MAX_AP, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [SIOCSIWSCAN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWSCAN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_SCAN_MAX_DATA, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [SIOCSIWESSID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWESSID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWNICKN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + }, + [SIOCGIWNICKN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + }, + [SIOCSIWRATE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWRATE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWRTS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWRTS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWFRAG - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWFRAG - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWTXPOW - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWTXPOW - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWRETRY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWRETRY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWENCODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ENCODING_TOKEN_MAX, + .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT, + }, + [SIOCGIWENCODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ENCODING_TOKEN_MAX, + .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT, + }, + [SIOCSIWPOWER - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWPOWER - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, +}; +static const int standard_ioctl_num = (sizeof(standard_ioctl) / + sizeof(struct iw_ioctl_description)); + +/* + * Meta-data about all the additional standard Wireless Extension events + * we know about. + */ +static const struct iw_ioctl_description standard_event[] = { + [IWEVTXDROP - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IWEVQUAL - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_QUAL, + }, + [IWEVCUSTOM - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_CUSTOM_MAX, + }, + [IWEVREGISTERED - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IWEVEXPIRED - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, +}; +static const int standard_event_num = (sizeof(standard_event) / + sizeof(struct iw_ioctl_description)); + +/* Size (in bytes) of the various private data types */ +static const char iw_priv_type_size[] = { + 0, /* IW_PRIV_TYPE_NONE */ + 1, /* IW_PRIV_TYPE_BYTE */ + 1, /* IW_PRIV_TYPE_CHAR */ + 0, /* Not defined */ + sizeof(__u32), /* IW_PRIV_TYPE_INT */ + sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */ + sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */ + 0, /* Not defined */ +}; + +/* Size (in bytes) of various events */ +static const int event_type_size[] = { + IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */ + 0, + IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ + 0, + IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */ + IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ + IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ + 0, + IW_EV_POINT_LEN, /* Without variable payload */ + IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ + IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ +}; + +/************************ COMMON SUBROUTINES ************************/ +/* + * Stuff that may be used in various place or doesn't fit in one + * of the section below. + */ + +/* ---------------------------------------------------------------- */ +/* + * Return the driver handler associated with a specific Wireless Extension. + * Called from various place, so make sure it remains efficient. + */ +static inline iw_handler get_handler(struct net_device *dev, + unsigned int cmd) +{ + /* Don't "optimise" the following variable, it will crash */ + unsigned int index; /* *MUST* be unsigned */ + + /* Check if we have some wireless handlers defined */ + if(dev->wireless_handlers == NULL) + return NULL; + + /* Try as a standard command */ + index = cmd - SIOCIWFIRST; + if(index < dev->wireless_handlers->num_standard) + return dev->wireless_handlers->standard[index]; + + /* Try as a private command */ + index = cmd - SIOCIWFIRSTPRIV; + if(index < dev->wireless_handlers->num_private) + return dev->wireless_handlers->private[index]; + + /* Not found */ + return NULL; +} + +/* ---------------------------------------------------------------- */ +/* + * Get statistics out of the driver + */ +static inline struct iw_statistics *get_wireless_stats(struct net_device *dev) +{ + /* New location */ + if((dev->wireless_handlers != NULL) && + (dev->wireless_handlers->get_wireless_stats != NULL)) + return dev->wireless_handlers->get_wireless_stats(dev); + + /* Old location, will be phased out in next WE */ + return (dev->get_wireless_stats ? + dev->get_wireless_stats(dev) : + (struct iw_statistics *) NULL); +} + +/* ---------------------------------------------------------------- */ +/* + * Call the commit handler in the driver + * (if exist and if conditions are right) + * + * Note : our current commit strategy is currently pretty dumb, + * but we will be able to improve on that... + * The goal is to try to agreagate as many changes as possible + * before doing the commit. Drivers that will define a commit handler + * are usually those that need a reset after changing parameters, so + * we want to minimise the number of reset. + * A cool idea is to use a timer : at each "set" command, we re-set the + * timer, when the timer eventually fires, we call the driver. + * Hopefully, more on that later. + * + * Also, I'm waiting to see how many people will complain about the + * netif_running(dev) test. I'm open on that one... + * Hopefully, the driver will remember to do a commit in "open()" ;-) + */ +static inline int call_commit_handler(struct net_device * dev) +{ + if((netif_running(dev)) && + (dev->wireless_handlers->standard[0] != NULL)) { + /* Call the commit handler on the driver */ + return dev->wireless_handlers->standard[0](dev, NULL, + NULL, NULL); + } else + return 0; /* Command completed successfully */ +} + +/* ---------------------------------------------------------------- */ +/* + * Calculate size of private arguments + */ +static inline int get_priv_size(__u16 args) +{ + int num = args & IW_PRIV_SIZE_MASK; + int type = (args & IW_PRIV_TYPE_MASK) >> 12; + + return num * iw_priv_type_size[type]; +} + +/* ---------------------------------------------------------------- */ +/* + * Re-calculate the size of private arguments + */ +static inline int adjust_priv_size(__u16 args, + union iwreq_data * wrqu) +{ + int num = wrqu->data.length; + int max = args & IW_PRIV_SIZE_MASK; + int type = (args & IW_PRIV_TYPE_MASK) >> 12; + + /* Make sure the driver doesn't goof up */ + if (max < num) + num = max; + + return num * iw_priv_type_size[type]; +} + + +/******************** /proc/net/wireless SUPPORT ********************/ +/* + * The /proc/net/wireless file is a human readable user-space interface + * exporting various wireless specific statistics from the wireless devices. + * This is the most popular part of the Wireless Extensions ;-) + * + * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). + * The content of the file is basically the content of "struct iw_statistics". + */ + +#ifdef CONFIG_PROC_FS + +/* ---------------------------------------------------------------- */ +/* + * Print one entry (line) of /proc/net/wireless + */ +static __inline__ void wireless_seq_printf_stats(struct seq_file *seq, + struct net_device *dev) +{ + /* Get stats from the driver */ + struct iw_statistics *stats = get_wireless_stats(dev); + + if (stats) { + seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " + "%6d %6d %6d\n", + dev->name, stats->status, stats->qual.qual, + stats->qual.updated & IW_QUAL_QUAL_UPDATED + ? '.' : ' ', + ((__u8) stats->qual.level), + stats->qual.updated & IW_QUAL_LEVEL_UPDATED + ? '.' : ' ', + ((__u8) stats->qual.noise), + stats->qual.updated & IW_QUAL_NOISE_UPDATED + ? '.' : ' ', + stats->discard.nwid, stats->discard.code, + stats->discard.fragment, stats->discard.retries, + stats->discard.misc, stats->miss.beacon); + stats->qual.updated = 0; + } +} + +/* ---------------------------------------------------------------- */ +/* + * Print info for /proc/net/wireless (print all entries) + */ +static int wireless_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "Inter-| sta-| Quality | Discarded " + "packets | Missed | WE\n" + " face | tus | link level noise | nwid " + "crypt frag retry misc | beacon | %d\n", + WIRELESS_EXT); + else + wireless_seq_printf_stats(seq, v); + return 0; +} + +extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); +extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); +extern void dev_seq_stop(struct seq_file *seq, void *v); + +static struct seq_operations wireless_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = wireless_seq_show, +}; + +static int wireless_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &wireless_seq_ops); +} + +static struct file_operations wireless_seq_fops = { + .owner = THIS_MODULE, + .open = wireless_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int __init wireless_proc_init(void) +{ + if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops)) + return -ENOMEM; + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +/************************** IOCTL SUPPORT **************************/ +/* + * The original user space API to configure all those Wireless Extensions + * is through IOCTLs. + * In there, we check if we need to call the new driver API (iw_handler) + * or just call the driver ioctl handler. + */ + +/* ---------------------------------------------------------------- */ +/* + * Allow programatic access to /proc/net/wireless even if /proc + * doesn't exist... Also more efficient... + */ +static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr) +{ + /* Get stats from the driver */ + struct iw_statistics *stats; + + stats = get_wireless_stats(dev); + if (stats != (struct iw_statistics *) NULL) { + struct iwreq * wrq = (struct iwreq *)ifr; + + /* Copy statistics to the user buffer */ + if(copy_to_user(wrq->u.data.pointer, stats, + sizeof(struct iw_statistics))) + return -EFAULT; + + /* Check if we need to clear the update flag */ + if(wrq->u.data.flags != 0) + stats->qual.updated = 0; + return 0; + } else + return -EOPNOTSUPP; +} + +/* ---------------------------------------------------------------- */ +/* + * Export the driver private handler definition + * They will be picked up by tools like iwpriv... + */ +static inline int ioctl_export_private(struct net_device * dev, + struct ifreq * ifr) +{ + struct iwreq * iwr = (struct iwreq *) ifr; + + /* Check if the driver has something to export */ + if((dev->wireless_handlers->num_private_args == 0) || + (dev->wireless_handlers->private_args == NULL)) + return -EOPNOTSUPP; + + /* Check NULL pointer */ + if(iwr->u.data.pointer == NULL) + return -EFAULT; + + /* Check if there is enough buffer up there */ + if(iwr->u.data.length < dev->wireless_handlers->num_private_args) { + /* User space can't know in advance how large the buffer + * needs to be. Give it a hint, so that we can support + * any size buffer we want somewhat efficiently... */ + iwr->u.data.length = dev->wireless_handlers->num_private_args; + return -E2BIG; + } + + /* Set the number of available ioctls. */ + iwr->u.data.length = dev->wireless_handlers->num_private_args; + + /* Copy structure to the user buffer. */ + if (copy_to_user(iwr->u.data.pointer, + dev->wireless_handlers->private_args, + sizeof(struct iw_priv_args) * iwr->u.data.length)) + return -EFAULT; + + return 0; +} + +/* ---------------------------------------------------------------- */ +/* + * Wrapper to call a standard Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + */ +static inline int ioctl_standard_call(struct net_device * dev, + struct ifreq * ifr, + unsigned int cmd, + iw_handler handler) +{ + struct iwreq * iwr = (struct iwreq *) ifr; + const struct iw_ioctl_description * descr; + struct iw_request_info info; + int ret = -EINVAL; + + /* Get the description of the IOCTL */ + if((cmd - SIOCIWFIRST) >= standard_ioctl_num) + return -EOPNOTSUPP; + descr = &(standard_ioctl[cmd - SIOCIWFIRST]); + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n", + ifr->ifr_name, cmd); + printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); +#endif /* WE_IOCTL_DEBUG */ + + /* Prepare the call */ + info.cmd = cmd; + info.flags = 0; + + /* Check if we have a pointer to user space data or not */ + if(descr->header_type != IW_HEADER_TYPE_POINT) { + + /* No extra arguments. Trivial to handle */ + ret = handler(dev, &info, &(iwr->u), NULL); + +#ifdef WE_SET_EVENT + /* Generate an event to notify listeners of the change */ + if((descr->flags & IW_DESCR_FLAG_EVENT) && + ((ret == 0) || (ret == -EIWCOMMIT))) + wireless_send_event(dev, cmd, &(iwr->u), NULL); +#endif /* WE_SET_EVENT */ + } else { + char * extra; + int extra_size; + int user_length = 0; + int err; + + /* Calculate space needed by arguments. Always allocate + * for max space. Easier, and won't last long... */ + extra_size = descr->max_tokens * descr->token_size; + + /* Check what user space is giving us */ + if(IW_IS_SET(cmd)) { + /* Check NULL pointer */ + if((iwr->u.data.pointer == NULL) && + (iwr->u.data.length != 0)) + return -EFAULT; + /* Check if number of token fits within bounds */ + if(iwr->u.data.length > descr->max_tokens) + return -E2BIG; + if(iwr->u.data.length < descr->min_tokens) + return -EINVAL; + } else { + /* Check NULL pointer */ + if(iwr->u.data.pointer == NULL) + return -EFAULT; + /* Save user space buffer size for checking */ + user_length = iwr->u.data.length; + + /* Don't check if user_length > max to allow forward + * compatibility. The test user_length < min is + * implied by the test at the end. */ + + /* Support for very large requests */ + if((descr->flags & IW_DESCR_FLAG_NOMAX) && + (user_length > descr->max_tokens)) { + /* Allow userspace to GET more than max so + * we can support any size GET requests. + * There is still a limit : -ENOMEM. */ + extra_size = user_length * descr->token_size; + /* Note : user_length is originally a __u16, + * and token_size is controlled by us, + * so extra_size won't get negative and + * won't overflow... */ + } + } + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", + dev->name, extra_size); +#endif /* WE_IOCTL_DEBUG */ + + /* Create the kernel buffer */ + extra = kmalloc(extra_size, GFP_KERNEL); + if (extra == NULL) { + return -ENOMEM; + } + + /* If it is a SET, get all the extra data in here */ + if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { + err = copy_from_user(extra, iwr->u.data.pointer, + iwr->u.data.length * + descr->token_size); + if (err) { + kfree(extra); + return -EFAULT; + } +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Got %d bytes\n", + dev->name, + iwr->u.data.length * descr->token_size); +#endif /* WE_IOCTL_DEBUG */ + } + + /* Call the handler */ + ret = handler(dev, &info, &(iwr->u), extra); + + /* If we have something to return to the user */ + if (!ret && IW_IS_GET(cmd)) { + /* Check if there is enough buffer up there */ + if(user_length < iwr->u.data.length) { + kfree(extra); + return -E2BIG; + } + + err = copy_to_user(iwr->u.data.pointer, extra, + iwr->u.data.length * + descr->token_size); + if (err) + ret = -EFAULT; +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n", + dev->name, + iwr->u.data.length * descr->token_size); +#endif /* WE_IOCTL_DEBUG */ + } + +#ifdef WE_SET_EVENT + /* Generate an event to notify listeners of the change */ + if((descr->flags & IW_DESCR_FLAG_EVENT) && + ((ret == 0) || (ret == -EIWCOMMIT))) { + if(descr->flags & IW_DESCR_FLAG_RESTRICT) + /* If the event is restricted, don't + * export the payload */ + wireless_send_event(dev, cmd, &(iwr->u), NULL); + else + wireless_send_event(dev, cmd, &(iwr->u), + extra); + } +#endif /* WE_SET_EVENT */ + + /* Cleanup - I told you it wasn't that long ;-) */ + kfree(extra); + } + + /* Call commit handler if needed and defined */ + if(ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + /* Here, we will generate the appropriate event if needed */ + + return ret; +} + +/* ---------------------------------------------------------------- */ +/* + * Wrapper to call a private Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + * It's not as nice and slimline as the standard wrapper. The cause + * is struct iw_priv_args, which was not really designed for the + * job we are going here. + * + * IMPORTANT : This function prevent to set and get data on the same + * IOCTL and enforce the SET/GET convention. Not doing it would be + * far too hairy... + * If you need to set and get data at the same time, please don't use + * a iw_handler but process it in your ioctl handler (i.e. use the + * old driver API). + */ +static inline int ioctl_private_call(struct net_device * dev, + struct ifreq * ifr, + unsigned int cmd, + iw_handler handler) +{ + struct iwreq * iwr = (struct iwreq *) ifr; + const struct iw_priv_args * descr = NULL; + struct iw_request_info info; + int extra_size = 0; + int i; + int ret = -EINVAL; + + /* Get the description of the IOCTL */ + for(i = 0; i < dev->wireless_handlers->num_private_args; i++) + if(cmd == dev->wireless_handlers->private_args[i].cmd) { + descr = &(dev->wireless_handlers->private_args[i]); + break; + } + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n", + ifr->ifr_name, cmd); + if(descr) { + printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n", + dev->name, descr->name, + descr->set_args, descr->get_args); + } +#endif /* WE_IOCTL_DEBUG */ + + /* Compute the size of the set/get arguments */ + if(descr != NULL) { + if(IW_IS_SET(cmd)) { + int offset = 0; /* For sub-ioctls */ + /* Check for sub-ioctl handler */ + if(descr->name[0] == '\0') + /* Reserve one int for sub-ioctl index */ + offset = sizeof(__u32); + + /* Size of set arguments */ + extra_size = get_priv_size(descr->set_args); + + /* Does it fits in iwr ? */ + if((descr->set_args & IW_PRIV_SIZE_FIXED) && + ((extra_size + offset) <= IFNAMSIZ)) + extra_size = 0; + } else { + /* Size of get arguments */ + extra_size = get_priv_size(descr->get_args); + + /* Does it fits in iwr ? */ + if((descr->get_args & IW_PRIV_SIZE_FIXED) && + (extra_size <= IFNAMSIZ)) + extra_size = 0; + } + } + + /* Prepare the call */ + info.cmd = cmd; + info.flags = 0; + + /* Check if we have a pointer to user space data or not. */ + if(extra_size == 0) { + /* No extra arguments. Trivial to handle */ + ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u)); + } else { + char * extra; + int err; + + /* Check what user space is giving us */ + if(IW_IS_SET(cmd)) { + /* Check NULL pointer */ + if((iwr->u.data.pointer == NULL) && + (iwr->u.data.length != 0)) + return -EFAULT; + + /* Does it fits within bounds ? */ + if(iwr->u.data.length > (descr->set_args & + IW_PRIV_SIZE_MASK)) + return -E2BIG; + } else { + /* Check NULL pointer */ + if(iwr->u.data.pointer == NULL) + return -EFAULT; + } + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", + dev->name, extra_size); +#endif /* WE_IOCTL_DEBUG */ + + /* Always allocate for max space. Easier, and won't last + * long... */ + extra = kmalloc(extra_size, GFP_KERNEL); + if (extra == NULL) { + return -ENOMEM; + } + + /* If it is a SET, get all the extra data in here */ + if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { + err = copy_from_user(extra, iwr->u.data.pointer, + extra_size); + if (err) { + kfree(extra); + return -EFAULT; + } +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Got %d elem\n", + dev->name, iwr->u.data.length); +#endif /* WE_IOCTL_DEBUG */ + } + + /* Call the handler */ + ret = handler(dev, &info, &(iwr->u), extra); + + /* If we have something to return to the user */ + if (!ret && IW_IS_GET(cmd)) { + + /* Adjust for the actual length if it's variable, + * avoid leaking kernel bits outside. */ + if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) { + extra_size = adjust_priv_size(descr->get_args, + &(iwr->u)); + } + + err = copy_to_user(iwr->u.data.pointer, extra, + extra_size); + if (err) + ret = -EFAULT; +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n", + dev->name, iwr->u.data.length); +#endif /* WE_IOCTL_DEBUG */ + } + + /* Cleanup - I told you it wasn't that long ;-) */ + kfree(extra); + } + + + /* Call commit handler if needed and defined */ + if(ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + return ret; +} + +/* ---------------------------------------------------------------- */ +/* + * Main IOCTl dispatcher. Called from the main networking code + * (dev_ioctl() in net/core/dev.c). + * Check the type of IOCTL and call the appropriate wrapper... + */ +int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd) +{ + struct net_device *dev; + iw_handler handler; + + /* Permissions are already checked in dev_ioctl() before calling us. + * The copy_to/from_user() of ifr is also dealt with in there */ + + /* Make sure the device exist */ + if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) + return -ENODEV; + + /* A bunch of special cases, then the generic case... + * Note that 'cmd' is already filtered in dev_ioctl() with + * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ + switch(cmd) + { + case SIOCGIWSTATS: + /* Get Wireless Stats */ + return dev_iwstats(dev, ifr); + + case SIOCGIWPRIV: + /* Check if we have some wireless handlers defined */ + if(dev->wireless_handlers != NULL) { + /* We export to user space the definition of + * the private handler ourselves */ + return ioctl_export_private(dev, ifr); + } + // ## Fall-through for old API ## + default: + /* Generic IOCTL */ + /* Basic check */ + if (!netif_device_present(dev)) + return -ENODEV; + /* New driver API : try to find the handler */ + handler = get_handler(dev, cmd); + if(handler != NULL) { + /* Standard and private are not the same */ + if(cmd < SIOCIWFIRSTPRIV) + return ioctl_standard_call(dev, + ifr, + cmd, + handler); + else + return ioctl_private_call(dev, + ifr, + cmd, + handler); + } + /* Old driver API : call driver ioctl handler */ + if (dev->do_ioctl) { + return dev->do_ioctl(dev, ifr, cmd); + } + return -EOPNOTSUPP; + } + /* Not reached */ + return -EINVAL; +} + +/************************* EVENT PROCESSING *************************/ +/* + * Process events generated by the wireless layer or the driver. + * Most often, the event will be propagated through rtnetlink + */ + +#ifdef WE_EVENT_NETLINK +/* "rtnl" is defined in net/core/rtnetlink.c, but we need it here. + * It is declared in <linux/rtnetlink.h> */ + +/* ---------------------------------------------------------------- */ +/* + * Fill a rtnetlink message with our event data. + * Note that we propage only the specified event and don't dump the + * current wireless config. Dumping the wireless config is far too + * expensive (for each parameter, the driver need to query the hardware). + */ +static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb, + struct net_device * dev, + int type, + char * event, + int event_len) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev->flags; + r->ifi_change = 0; /* Wireless changes don't affect those flags */ + + /* Add the wireless events in the netlink packet */ + RTA_PUT(skb, IFLA_WIRELESS, + event_len, event); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +/* ---------------------------------------------------------------- */ +/* + * Create and broadcast and send it on the standard rtnetlink socket + * This is a pure clone rtmsg_ifinfo() in net/core/rtnetlink.c + * Andrzej Krzysztofowicz mandated that I used a IFLA_XXX field + * within a RTM_NEWLINK event. + */ +static inline void rtmsg_iwinfo(struct net_device * dev, + char * event, + int event_len) +{ + struct sk_buff *skb; + int size = NLMSG_GOODSIZE; + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (rtnetlink_fill_iwinfo(skb, dev, RTM_NEWLINK, + event, event_len) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC); +} +#endif /* WE_EVENT_NETLINK */ + +/* ---------------------------------------------------------------- */ +/* + * Main event dispatcher. Called from other parts and drivers. + * Send the event on the appropriate channels. + * May be called from interrupt context. + */ +void wireless_send_event(struct net_device * dev, + unsigned int cmd, + union iwreq_data * wrqu, + char * extra) +{ + const struct iw_ioctl_description * descr = NULL; + int extra_len = 0; + struct iw_event *event; /* Mallocated whole event */ + int event_len; /* Its size */ + int hdr_len; /* Size of the event header */ + /* Don't "optimise" the following variable, it will crash */ + unsigned cmd_index; /* *MUST* be unsigned */ + + /* Get the description of the IOCTL */ + if(cmd <= SIOCIWLAST) { + cmd_index = cmd - SIOCIWFIRST; + if(cmd_index < standard_ioctl_num) + descr = &(standard_ioctl[cmd_index]); + } else { + cmd_index = cmd - IWEVFIRST; + if(cmd_index < standard_event_num) + descr = &(standard_event[cmd_index]); + } + /* Don't accept unknown events */ + if(descr == NULL) { + /* Note : we don't return an error to the driver, because + * the driver would not know what to do about it. It can't + * return an error to the user, because the event is not + * initiated by a user request. + * The best the driver could do is to log an error message. + * We will do it ourselves instead... + */ + printk(KERN_ERR "%s (WE) : Invalid/Unknown Wireless Event (0x%04X)\n", + dev->name, cmd); + return; + } +#ifdef WE_EVENT_DEBUG + printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n", + dev->name, cmd); + printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); +#endif /* WE_EVENT_DEBUG */ + + /* Check extra parameters and set extra_len */ + if(descr->header_type == IW_HEADER_TYPE_POINT) { + /* Check if number of token fits within bounds */ + if(wrqu->data.length > descr->max_tokens) { + printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length); + return; + } + if(wrqu->data.length < descr->min_tokens) { + printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length); + return; + } + /* Calculate extra_len - extra is NULL for restricted events */ + if(extra != NULL) + extra_len = wrqu->data.length * descr->token_size; +#ifdef WE_EVENT_DEBUG + printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len); +#endif /* WE_EVENT_DEBUG */ + } + + /* Total length of the event */ + hdr_len = event_type_size[descr->header_type]; + event_len = hdr_len + extra_len; + +#ifdef WE_EVENT_DEBUG + printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len); +#endif /* WE_EVENT_DEBUG */ + + /* Create temporary buffer to hold the event */ + event = kmalloc(event_len, GFP_ATOMIC); + if(event == NULL) + return; + + /* Fill event */ + event->len = event_len; + event->cmd = cmd; + memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN); + if(extra != NULL) + memcpy(((char *) event) + hdr_len, extra, extra_len); + +#ifdef WE_EVENT_NETLINK + /* rtnetlink event channel */ + rtmsg_iwinfo(dev, (char *) event, event_len); +#endif /* WE_EVENT_NETLINK */ + + /* Cleanup */ + kfree(event); + + return; /* Always success, I guess ;-) */ +} + +/********************** ENHANCED IWSPY SUPPORT **********************/ +/* + * In the old days, the driver was handling spy support all by itself. + * Now, the driver can delegate this task to Wireless Extensions. + * It needs to use those standard spy iw_handler in struct iw_handler_def, + * push data to us via wireless_spy_update() and include struct iw_spy_data + * in its private part (and advertise it in iw_handler_def->spy_offset). + * One of the main advantage of centralising spy support here is that + * it becomes much easier to improve and extend it without having to touch + * the drivers. One example is the addition of the Spy-Threshold events. + */ + +/* ---------------------------------------------------------------- */ +/* + * Return the pointer to the spy data in the driver. + * Because this is called on the Rx path via wireless_spy_update(), + * we want it to be efficient... + */ +static inline struct iw_spy_data * get_spydata(struct net_device *dev) +{ + /* This is the new way */ + if(dev->wireless_data) + return(dev->wireless_data->spy_data); + + /* This is the old way. Doesn't work for multi-headed drivers. + * It will be removed in the next version of WE. */ + return (dev->priv + dev->wireless_handlers->spy_offset); +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set Spy List + */ +int iw_handler_set_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + + if(!dev->wireless_data) + /* Help user know that driver needs updating */ + printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n", + dev->name); + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + /* Disable spy collection while we copy the addresses. + * While we copy addresses, any call to wireless_spy_update() + * will NOP. This is OK, as anyway the addresses are changing. */ + spydata->spy_number = 0; + + /* We want to operate without locking, because wireless_spy_update() + * most likely will happen in the interrupt handler, and therefore + * have its own locking constraints and needs performance. + * The rtnl_lock() make sure we don't race with the other iw_handlers. + * This make sure wireless_spy_update() "see" that the spy list + * is temporarily disabled. */ + wmb(); + + /* Are there are addresses to copy? */ + if(wrqu->data.length > 0) { + int i; + + /* Copy addresses */ + for(i = 0; i < wrqu->data.length; i++) + memcpy(spydata->spy_address[i], address[i].sa_data, + ETH_ALEN); + /* Reset stats */ + memset(spydata->spy_stat, 0, + sizeof(struct iw_quality) * IW_MAX_SPY); + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "iw_handler_set_spy() : offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length); + for (i = 0; i < wrqu->data.length; i++) + printk(KERN_DEBUG + "%02X:%02X:%02X:%02X:%02X:%02X \n", + spydata->spy_address[i][0], + spydata->spy_address[i][1], + spydata->spy_address[i][2], + spydata->spy_address[i][3], + spydata->spy_address[i][4], + spydata->spy_address[i][5]); +#endif /* WE_SPY_DEBUG */ + } + + /* Make sure above is updated before re-enabling */ + wmb(); + + /* Enable addresses */ + spydata->spy_number = wrqu->data.length; + + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get Spy List + */ +int iw_handler_get_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + int i; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + wrqu->data.length = spydata->spy_number; + + /* Copy addresses. */ + for(i = 0; i < spydata->spy_number; i++) { + memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); + address[i].sa_family = AF_UNIX; + } + /* Copy stats to the user buffer (just after). */ + if(spydata->spy_number > 0) + memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), + spydata->spy_stat, + sizeof(struct iw_quality) * spydata->spy_number); + /* Reset updated flags. */ + for(i = 0; i < spydata->spy_number; i++) + spydata->spy_stat[i].updated = 0; + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set spy threshold + */ +int iw_handler_set_thrspy(struct net_device * dev, + struct iw_request_info *info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + memcpy(&(spydata->spy_thr_low), &(threshold->low), + 2 * sizeof(struct iw_quality)); + + /* Clear flag */ + memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "iw_handler_set_thrspy() : low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level); +#endif /* WE_SPY_DEBUG */ + + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get spy threshold + */ +int iw_handler_get_thrspy(struct net_device * dev, + struct iw_request_info *info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + memcpy(&(threshold->low), &(spydata->spy_thr_low), + 2 * sizeof(struct iw_quality)); + + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Prepare and send a Spy Threshold event + */ +static void iw_send_thrspy_event(struct net_device * dev, + struct iw_spy_data * spydata, + unsigned char * address, + struct iw_quality * wstats) +{ + union iwreq_data wrqu; + struct iw_thrspy threshold; + + /* Init */ + wrqu.data.length = 1; + wrqu.data.flags = 0; + /* Copy address */ + memcpy(threshold.addr.sa_data, address, ETH_ALEN); + threshold.addr.sa_family = ARPHRD_ETHER; + /* Copy stats */ + memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); + /* Copy also thresholds */ + memcpy(&(threshold.low), &(spydata->spy_thr_low), + 2 * sizeof(struct iw_quality)); + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n", + threshold.addr.sa_data[0], + threshold.addr.sa_data[1], + threshold.addr.sa_data[2], + threshold.addr.sa_data[3], + threshold.addr.sa_data[4], + threshold.addr.sa_data[5], threshold.qual.level); +#endif /* WE_SPY_DEBUG */ + + /* Send event to user space */ + wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); +} + +/* ---------------------------------------------------------------- */ +/* + * Call for the driver to update the spy data. + * For now, the spy data is a simple array. As the size of the array is + * small, this is good enough. If we wanted to support larger number of + * spy addresses, we should use something more efficient... + */ +void wireless_spy_update(struct net_device * dev, + unsigned char * address, + struct iw_quality * wstats) +{ + struct iw_spy_data * spydata = get_spydata(dev); + int i; + int match = -1; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return; + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "wireless_spy_update() : offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]); +#endif /* WE_SPY_DEBUG */ + + /* Update all records that match */ + for(i = 0; i < spydata->spy_number; i++) + if(!memcmp(address, spydata->spy_address[i], ETH_ALEN)) { + memcpy(&(spydata->spy_stat[i]), wstats, + sizeof(struct iw_quality)); + match = i; + } + + /* Generate an event if we cross the spy threshold. + * To avoid event storms, we have a simple hysteresis : we generate + * event only when we go under the low threshold or above the + * high threshold. */ + if(match >= 0) { + if(spydata->spy_thr_under[match]) { + if(wstats->level > spydata->spy_thr_high.level) { + spydata->spy_thr_under[match] = 0; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } else { + if(wstats->level < spydata->spy_thr_low.level) { + spydata->spy_thr_under[match] = 1; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } + } +} + +EXPORT_SYMBOL(iw_handler_get_spy); +EXPORT_SYMBOL(iw_handler_get_thrspy); +EXPORT_SYMBOL(iw_handler_set_spy); +EXPORT_SYMBOL(iw_handler_set_thrspy); +EXPORT_SYMBOL(wireless_send_event); +EXPORT_SYMBOL(wireless_spy_update); |