diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-04 16:27:41 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-04 16:27:41 -0800 |
commit | d347da0deffa1d8f88f0d270eab040e4707c9916 (patch) | |
tree | e0911f2ef4d36a7b44f7a5379feabebbd37dcfc4 | |
parent | c6c88bbde4d8b2ffe9886b7130b2e23781d424e5 (diff) | |
parent | 74cb8798222bb7d1aecb0acb91e6eeedf5feb948 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
263 files changed, 6860 insertions, 2857 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ebc09a159f62..2b7cf19a06ad 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER for the hash secret) for IP fragments. Default: 600 +ipfrag_max_dist - INTEGER + ipfrag_max_dist is a non-negative integer value which defines the + maximum "disorder" which is allowed among fragments which share a + common IP source address. Note that reordering of packets is + not unusual, but if a large number of fragments arrive from a source + IP address while a particular fragment queue remains incomplete, it + probably indicates that one or more fragments belonging to that queue + have been lost. When ipfrag_max_dist is positive, an additional check + is done on fragments before they are added to a reassembly queue - if + ipfrag_max_dist (or more) fragments have arrived from a particular IP + address between additions to any IP fragment queue using that source + address, it's presumed that one or more fragments in the queue are + lost. The existing fragment queue will be dropped, and a new one + started. An ipfrag_max_dist value of zero disables this check. + + Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can + result in unnecessarily dropping fragment queues when normal + reordering of packets occurs, which could lead to poor application + performance. Using a very large value, e.g. 50000, increases the + likelihood of incorrectly reassembling IP fragments that originate + from different IP datagrams, which could result in data corruption. + Default: 64 + INET peer storage: inet_peer_threshold - INTEGER diff --git a/drivers/char/random.c b/drivers/char/random.c index 7999da25fe40..bdfdfd28594d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, EXPORT_SYMBOL(secure_tcp_sequence_number); - - -/* Generate secure starting point for ephemeral TCP port search */ -u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) +/* Generate secure starting point for ephemeral IPV4 transport port search */ +u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) { struct keydata *keyptr = get_keyptr(); u32 hash[4]; @@ -1575,7 +1573,7 @@ u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) +u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) { struct keydata *keyptr = get_keyptr(); u32 hash[12]; @@ -1586,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp return twothirdsMD4Transform(daddr, hash); } -EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 475d98fa9e26..780009c7eaa6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -47,6 +47,8 @@ #include <linux/ip.h> #include <linux/in.h> +#include <net/dst.h> + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ef3ee035bbc8..ed0c2ead8bc1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -43,6 +43,8 @@ #include <linux/delay.h> #include <linux/completion.h> +#include <net/dst.h> + #include "ipoib.h" #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index f857ae94d261..b0c3b6ab6263 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -115,6 +115,7 @@ #include <linux/ethtool.h> #include <linux/timer.h> #include <linux/if_vlan.h> +#include <linux/rtnetlink.h> #include <asm/io.h> #include <asm/uaccess.h> diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index a842ecc60a34..9369f811075d 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -85,7 +85,7 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb); static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); -static struct proto_ops pppoe_ops; +static const struct proto_ops pppoe_ops; static DEFINE_RWLOCK(pppoe_hash_lock); static struct ppp_channel_ops pppoe_chan_ops; @@ -383,8 +383,6 @@ static int pppoe_rcv(struct sk_buff *skb, { struct pppoe_hdr *ph; struct pppox_sock *po; - struct sock *sk; - int ret; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; @@ -395,24 +393,8 @@ static int pppoe_rcv(struct sk_buff *skb, ph = (struct pppoe_hdr *) skb->nh.raw; po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source); - if (!po) - goto drop; - - sk = sk_pppox(po); - bh_lock_sock(sk); - - /* Socket state is unknown, must put skb into backlog. */ - if (sock_owned_by_user(sk) != 0) { - sk_add_backlog(sk, skb); - ret = NET_RX_SUCCESS; - } else { - ret = pppoe_rcv_core(sk, skb); - } - - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + if (po != NULL) + return sk_receive_skb(sk_pppox(po), skb); drop: kfree_skb(skb); out: @@ -1081,9 +1063,7 @@ static int __init pppoe_proc_init(void) static inline int pppoe_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ -/* ->ioctl are set at pppox_create */ - -static struct proto_ops pppoe_ops = { +static const struct proto_ops pppoe_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppoe_release, @@ -1099,7 +1079,8 @@ static struct proto_ops pppoe_ops = { .getsockopt = sock_no_getsockopt, .sendmsg = pppoe_sendmsg, .recvmsg = pppoe_recvmsg, - .mmap = sock_no_mmap + .mmap = sock_no_mmap, + .ioctl = pppox_ioctl, }; static struct pppox_proto pppoe_proto = { diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c index 0c1e114527fb..9315046b3f55 100644 --- a/drivers/net/pppox.c +++ b/drivers/net/pppox.c @@ -68,8 +68,7 @@ EXPORT_SYMBOL(register_pppox_proto); EXPORT_SYMBOL(unregister_pppox_proto); EXPORT_SYMBOL(pppox_unbind_sock); -static int pppox_ioctl(struct socket* sock, unsigned int cmd, - unsigned long arg) +int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); @@ -105,6 +104,7 @@ static int pppox_ioctl(struct socket* sock, unsigned int cmd, return rc; } +EXPORT_SYMBOL(pppox_ioctl); static int pppox_create(struct socket *sock, int protocol) { @@ -119,11 +119,7 @@ static int pppox_create(struct socket *sock, int protocol) goto out; rc = pppox_protos[protocol]->create(sock); - if (!rc) { - /* We get to set the ioctl handler. */ - /* For everything else, pppox is just a shell. */ - sock->ops->ioctl = pppox_ioctl; - } + module_put(pppox_protos[protocol]->owner); out: return rc; diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index ae7343934758..e1a2d52cc1fe 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -107,6 +107,7 @@ #include "h/skversion.h" +#include <linux/in.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 00d683063c01..d8cc3aea032a 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -25,6 +25,7 @@ */ #include <linux/config.h> +#include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/moduleparam.h> diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 2fc9893d69e1..eb86b059809b 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -24,6 +24,7 @@ #include <linux/compiler.h> #include <linux/slab.h> #include <linux/delay.h> +#include <linux/in.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/pci.h> @@ -3650,7 +3651,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) TXD_FLAG_CPU_POST_DMA); skb->nh.iph->check = 0; - skb->nh.iph->tot_len = ntohs(mss + ip_tcp_len + tcp_opt_len); + skb->nh.iph->tot_len = htons(mss + ip_tcp_len + tcp_opt_len); if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) { skb->h.th->check = 0; base_flags &= ~TXD_FLAG_TCPUDP_CSUM; diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c index 5e7c7e944c9d..64f6d1f25753 100644 --- a/drivers/net/wireless/ipw2200.c +++ b/drivers/net/wireless/ipw2200.c @@ -7456,8 +7456,7 @@ static void ipw_handle_data_packet(struct ipw_priv *priv, /* HW decrypt will not clear the WEP bit, MIC, PN, etc. */ hdr = (struct ieee80211_hdr_4addr *)rxb->skb->data; if (priv->ieee->iw_mode != IW_MODE_MONITOR && - ((is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr1)) ? + (is_multicast_ether_addr(hdr->addr1) ? !priv->ieee->host_mc_decrypt : !priv->ieee->host_decrypt)) ipw_rebuild_decrypted_skb(priv, rxb->skb); @@ -7648,8 +7647,7 @@ static inline int is_network_packet(struct ipw_priv *priv, return 0; /* {broad,multi}cast packets to our BSSID go through */ - if (is_multicast_ether_addr(header->addr1) || - is_broadcast_ether_addr(header->addr1)) + if (is_multicast_ether_addr(header->addr1)) return !memcmp(header->addr3, priv->bssid, ETH_ALEN); /* packets to our adapter go through */ @@ -7662,8 +7660,7 @@ static inline int is_network_packet(struct ipw_priv *priv, return 0; /* {broad,multi}cast packets to our BSS go through */ - if (is_multicast_ether_addr(header->addr1) || - is_broadcast_ether_addr(header->addr1)) + if (is_multicast_ether_addr(header->addr1)) return !memcmp(header->addr2, priv->bssid, ETH_ALEN); /* packets to our adapter go through */ @@ -9657,8 +9654,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb, switch (priv->ieee->iw_mode) { case IW_MODE_ADHOC: hdr_len = IEEE80211_3ADDR_LEN; - unicast = !(is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr1)); + unicast = !is_multicast_ether_addr(hdr->addr1); id = ipw_find_station(priv, hdr->addr1); if (id == IPW_INVALID_STATION) { id = ipw_add_station(priv, hdr->addr1); @@ -9673,8 +9669,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb, case IW_MODE_INFRA: default: - unicast = !(is_multicast_ether_addr(hdr->addr3) || - is_broadcast_ether_addr(hdr->addr3)); + unicast = !is_multicast_ether_addr(hdr->addr3); hdr_len = IEEE80211_3ADDR_LEN; id = 0; break; diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index a93c2bf94c33..6a9a75d40f73 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -26,6 +26,7 @@ */ #include <linux/config.h> +#include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f2ca782aba33..30cae3602867 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,9 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> + +#include <net/inet_sock.h> + #include "nfs4_fs.h" #include "callback.h" diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h index 578ed3f1a607..302201f1a097 100644 --- a/include/asm-alpha/bitops.h +++ b/include/asm-alpha/bitops.h @@ -321,6 +321,7 @@ static inline int fls(int word) #else #define fls generic_fls #endif +#define fls64 generic_fls64 /* Compute powers of two for the given integer. */ static inline long floor_log2(unsigned long word) diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h index 7399d431edfe..d02de721ecc1 100644 --- a/include/asm-arm/bitops.h +++ b/include/asm-arm/bitops.h @@ -332,6 +332,7 @@ static inline unsigned long __ffs(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * ffs: find first bit set. This is defined the same way as @@ -351,6 +352,7 @@ static inline unsigned long __ffs(unsigned long word) #define fls(x) \ ( __builtin_constant_p(x) ? generic_fls(x) : \ ({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) ) +#define fls64(x) generic_fls64(x) #define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) #define __ffs(x) (ffs(x) - 1) #define ffz(x) __ffs( ~(x) ) diff --git a/include/asm-arm26/bitops.h b/include/asm-arm26/bitops.h index 7d062fb2e343..15cc6f2da792 100644 --- a/include/asm-arm26/bitops.h +++ b/include/asm-arm26/bitops.h @@ -259,6 +259,7 @@ static inline unsigned long __ffs(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * ffs: find first bit set. This is defined the same way as diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h index 1bddb3f3a289..d3eb0f1e4208 100644 --- a/include/asm-cris/bitops.h +++ b/include/asm-cris/bitops.h @@ -240,6 +240,7 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * hweightN - returns the hamming weight of a N-bit word diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h index b664bd5b6663..02be7b3a8a83 100644 --- a/include/asm-frv/bitops.h +++ b/include/asm-frv/bitops.h @@ -228,6 +228,7 @@ found_middle: \ bit ? 33 - bit : bit; \ }) +#define fls64(x) generic_fls64(x) /* * Every architecture must define this function. It's the fastest diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index ce31b739fd80..0e6d9852008c 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -56,6 +56,7 @@ extern __inline__ int test_bit(int nr, const unsigned long * addr) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h index 5036f595f8c9..c0411ec9d651 100644 --- a/include/asm-h8300/bitops.h +++ b/include/asm-h8300/bitops.h @@ -406,5 +406,6 @@ found_middle: #endif /* __KERNEL__ */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* _H8300_BITOPS_H */ diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h index ddf1739dc7fd..4807aa1d2e3d 100644 --- a/include/asm-i386/bitops.h +++ b/include/asm-i386/bitops.h @@ -372,6 +372,7 @@ static inline unsigned long ffz(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h index 7232528e2d0c..36d0fb95ea89 100644 --- a/include/asm-ia64/bitops.h +++ b/include/asm-ia64/bitops.h @@ -345,6 +345,7 @@ fls (int t) x |= x >> 16; return ia64_popcnt(x); } +#define fls64(x) generic_fls64(x) /* * ffs: find first bit set. This is defined the same way as the libc and compiler builtin diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h index e78443981349..abea2fdd8689 100644 --- a/include/asm-m32r/bitops.h +++ b/include/asm-m32r/bitops.h @@ -465,6 +465,7 @@ static __inline__ unsigned long __ffs(unsigned long word) * fls: find last bit set. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h index b1bcf7c66516..13f4c0048463 100644 --- a/include/asm-m68k/bitops.h +++ b/include/asm-m68k/bitops.h @@ -310,6 +310,7 @@ static inline int fls(int x) return 32 - cnt; } +#define fls64(x) generic_fls64(x) /* * Every architecture must define this function. It's the fastest diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h index c42f88a9b9f9..4058dd086a02 100644 --- a/include/asm-m68knommu/bitops.h +++ b/include/asm-m68knommu/bitops.h @@ -499,5 +499,6 @@ found_middle: * fls: find last bit set. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* _M68KNOMMU_BITOPS_H */ diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h index 5496f9064a6a..3b0c8aaf6e8b 100644 --- a/include/asm-mips/bitops.h +++ b/include/asm-mips/bitops.h @@ -695,7 +695,7 @@ static inline unsigned long fls(unsigned long word) return flz(~word) + 1; } - +#define fls64(x) generic_fls64(x) /* * find_next_zero_bit - find the first zero bit in a memory region diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h index 55b98c67fd82..15d8c2b51584 100644 --- a/include/asm-parisc/bitops.h +++ b/include/asm-parisc/bitops.h @@ -263,6 +263,7 @@ static __inline__ int fls(int x) return ret; } +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h index 5727229b0444..1996eaa8aeae 100644 --- a/include/asm-powerpc/bitops.h +++ b/include/asm-powerpc/bitops.h @@ -310,6 +310,7 @@ static __inline__ int fls(unsigned int x) asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); return 32 - lz; } +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h index b07c578b22ea..61232760cc3b 100644 --- a/include/asm-s390/bitops.h +++ b/include/asm-s390/bitops.h @@ -839,6 +839,7 @@ static inline int sched_find_first_bit(unsigned long *b) * fls: find last bit set. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h index 5163d1ff2f1b..1c5260860045 100644 --- a/include/asm-sh/bitops.h +++ b/include/asm-sh/bitops.h @@ -470,6 +470,7 @@ found_middle: */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* __KERNEL__ */ diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h index e1ff63e09227..ce9c3ad45fe0 100644 --- a/include/asm-sh64/bitops.h +++ b/include/asm-sh64/bitops.h @@ -510,6 +510,7 @@ found_middle: #define ffs(x) generic_ffs(x) #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* __KERNEL__ */ diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h index bfbd795a0a80..41722b5e45ef 100644 --- a/include/asm-sparc/bitops.h +++ b/include/asm-sparc/bitops.h @@ -298,6 +298,7 @@ static inline int ffs(int x) * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h index 6388b8376c50..6efc0162fb09 100644 --- a/include/asm-sparc64/bitops.h +++ b/include/asm-sparc64/bitops.h @@ -119,6 +119,7 @@ static inline unsigned long __ffs(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h index b91e799763fd..8955d2376ac8 100644 --- a/include/asm-v850/bitops.h +++ b/include/asm-v850/bitops.h @@ -276,6 +276,7 @@ found_middle: #define ffs(x) generic_ffs (x) #define fls(x) generic_fls (x) +#define fls64(x) generic_fls64(x) #define __ffs(x) ffs(x) diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index 05a0d374404b..a4d5d0909453 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -340,6 +340,20 @@ static __inline__ unsigned long __ffs(unsigned long word) return word; } +/* + * __fls: find last bit set. + * @word: The word to search + * + * Undefined if no zero exists, so code should check against ~0UL first. + */ +static __inline__ unsigned long __fls(unsigned long word) +{ + __asm__("bsrq %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + #ifdef __KERNEL__ static inline int sched_find_first_bit(const unsigned long *b) @@ -370,6 +384,19 @@ static __inline__ int ffs(int x) } /** + * fls64 - find last bit set in 64 bit word + * @x: the word to search + * + * This is defined the same way as fls. + */ +static __inline__ int fls64(__u64 x) +{ + if (x == 0) + return 0; + return __fls(x) + 1; +} + +/** * hweightN - returns the hamming weight of a N-bit word * @x: the word to weigh * diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h index e76ee889e21d..0a2065f1a372 100644 --- a/include/asm-xtensa/bitops.h +++ b/include/asm-xtensa/bitops.h @@ -245,6 +245,7 @@ static __inline__ int fls (unsigned int x) { return __cntlz(x); } +#define fls64(x) generic_fls64(x) static __inline__ int find_next_bit(const unsigned long *addr, int size, int offset) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 38c2fb7ebe09..6a2a19f14bb2 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -76,6 +76,15 @@ static __inline__ int generic_fls(int x) */ #include <asm/bitops.h> + +static inline int generic_fls64(__u64 x) +{ + __u32 h = x >> 32; + if (h) + return fls(x) + 32; + return fls(x); +} + static __inline__ int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 71fab4311e92..088529f54965 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -192,10 +192,9 @@ enum { #include <linux/workqueue.h> #include <net/inet_connection_sock.h> +#include <net/inet_sock.h> #include <net/inet_timewait_sock.h> -#include <net/sock.h> #include <net/tcp_states.h> -#include <net/tcp.h> enum dccp_state { DCCP_OPEN = TCP_ESTABLISHED, @@ -408,8 +407,6 @@ struct dccp_ackvec; * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss * @dccps_timestamp_time - time of latest TIMESTAMP option * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) - * @dccps_pmtu_cookie - Last pmtu seen by socket * @dccps_packet_size - Set thru setsockopt * @dccps_role - Role of this sock, one of %dccp_role * @dccps_ndp_count - number of Non Data Packets since last data packet @@ -434,8 +431,6 @@ struct dccp_sock { __u32 dccps_timestamp_echo; __u32 dccps_packet_size; unsigned long dccps_ndp_count; - __u16 dccps_ext_header_len; - __u32 dccps_pmtu_cookie; __u32 dccps_mss_cache; struct dccp_options dccps_options; struct dccp_ackvec *dccps_hc_rx_ackvec; diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 5f49a30eb6f2..745c988359c0 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -63,10 +63,11 @@ static inline int is_zero_ether_addr(const u8 *addr) * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a multicast address. + * By definition the broadcast address is also a multicast address. */ static inline int is_multicast_ether_addr(const u8 *addr) { - return ((addr[0] != 0xff) && (0x01 & addr[0])); + return (0x01 & addr[0]); } /** diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index e677f73f13dd..4fab3d0a4bce 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -157,8 +157,7 @@ struct pppox_proto { extern int register_pppox_proto(int proto_num, struct pppox_proto *pp); extern void unregister_pppox_proto(int proto_num); extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ -extern int pppox_channel_ioctl(struct ppp_channel *pc, unsigned int cmd, - unsigned long arg); +extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); /* PPPoX socket states */ enum { diff --git a/include/linux/ip.h b/include/linux/ip.h index 33e8a19a1a0f..9e2eb9a602eb 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -16,6 +16,7 @@ */ #ifndef _LINUX_IP_H #define _LINUX_IP_H +#include <linux/types.h> #include <asm/byteorder.h> #define IPTOS_TOS_MASK 0x1E @@ -78,126 +79,6 @@ #define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ #define IPOPT_TS_PRESPEC 3 /* specified modules only */ -#ifdef __KERNEL__ -#include <linux/config.h> -#include <linux/types.h> -#include <net/request_sock.h> -#include <net/sock.h> -#include <linux/igmp.h> -#include <net/flow.h> - -struct ip_options { - __u32 faddr; /* Saved first hop address */ - unsigned char optlen; - unsigned char srr; - unsigned char rr; - unsigned char ts; - unsigned char is_setbyuser:1, /* Set by setsockopt? */ - is_data:1, /* Options in __data, rather than skb */ - is_strictroute:1, /* Strict source route */ - srr_is_hit:1, /* Packet destination addr was our one */ - is_changed:1, /* IP checksum more not valid */ - rr_needaddr:1, /* Need to record addr of outgoing dev */ - ts_needtime:1, /* Need to record timestamp */ - ts_needaddr:1; /* Need to record addr of outgoing dev */ - unsigned char router_alert; - unsigned char __pad1; - unsigned char __pad2; - unsigned char __data[0]; -}; - -#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) - -struct inet_request_sock { - struct request_sock req; - u32 loc_addr; - u32 rmt_addr; - u16 rmt_port; - u16 snd_wscale : 4, - rcv_wscale : 4, - tstamp_ok : 1, - sack_ok : 1, - wscale_ok : 1, - ecn_ok : 1, - acked : 1; - struct ip_options *opt; -}; - -static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) -{ - return (struct inet_request_sock *)sk; -} - -struct ipv6_pinfo; - -struct inet_sock { - /* sk and pinet6 has to be the first two members of inet_sock */ - struct sock sk; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct ipv6_pinfo *pinet6; -#endif - /* Socket demultiplex comparisons on incoming packets. */ - __u32 daddr; /* Foreign IPv4 addr */ - __u32 rcv_saddr; /* Bound local IPv4 addr */ - __u16 dport; /* Destination port */ - __u16 num; /* Local port */ - __u32 saddr; /* Sending source */ - __s16 uc_ttl; /* Unicast TTL */ - __u16 cmsg_flags; - struct ip_options *opt; - __u16 sport; /* Source port */ - __u16 id; /* ID counter for DF pkts */ - __u8 tos; /* TOS */ - __u8 mc_ttl; /* Multicasting TTL */ - __u8 pmtudisc; - unsigned recverr : 1, - freebind : 1, - hdrincl : 1, - mc_loop : 1; - int mc_index; /* Multicast device index */ - __u32 mc_addr; - struct ip_mc_socklist *mc_list; /* Group array */ - /* - * Following members are used to retain the infomation to build - * an ip header on each ip fragmentation while the socket is corked. - */ - struct { - unsigned int flags; - unsigned int fragsize; - struct ip_options *opt; - struct rtable *rt; - int length; /* Total length of all frames */ - u32 addr; - struct flowi fl; - } cork; -}; - -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ - -static inline struct inet_sock *inet_sk(const struct sock *sk) -{ - return (struct inet_sock *)sk; -} - -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} -#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); -} -#endif -#endif - -extern int inet_sk_rebuild_header(struct sock *sk); - struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e0b922785d98..93bbed5c6cf4 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -171,12 +171,13 @@ enum { }; #ifdef __KERNEL__ -#include <linux/in6.h> /* struct sockaddr_in6 */ #include <linux/icmpv6.h> -#include <net/if_inet6.h> /* struct ipv6_mc_socklist */ #include <linux/tcp.h> #include <linux/udp.h> +#include <net/if_inet6.h> /* struct ipv6_mc_socklist */ +#include <net/inet_sock.h> + /* This structure contains results of exthdrs parsing as offsets from skb->nh. @@ -199,18 +200,17 @@ static inline int inet6_iif(const struct sk_buff *skb) return IP6CB(skb)->iif; } -struct tcp6_request_sock { - struct tcp_request_sock req; +struct inet6_request_sock { struct in6_addr loc_addr; struct in6_addr rmt_addr; struct sk_buff *pktopts; int iif; }; -static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk) -{ - return (struct tcp6_request_sock *)sk; -} +struct tcp6_request_sock { + struct tcp_request_sock tcp6rsk_tcp; + struct inet6_request_sock tcp6rsk_inet6; +}; /** * struct ipv6_pinfo - ipv6 private area @@ -298,12 +298,36 @@ struct tcp6_sock { struct ipv6_pinfo inet6; }; +extern int inet6_sk_rebuild_header(struct sock *sk); + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { return inet_sk(__sk)->pinet6; } +static inline struct inet6_request_sock * + inet6_rsk(const struct request_sock *rsk) +{ + return (struct inet6_request_sock *)(((u8 *)rsk) + + inet_rsk(rsk)->inet6_rsk_offset); +} + +static inline u32 inet6_rsk_offset(struct request_sock *rsk) +{ + return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock); +} + +static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops) +{ + struct request_sock *req = reqsk_alloc(ops); + + if (req != NULL) + inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req); + + return req; +} + static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return (struct raw6_sock *)sk; @@ -323,28 +347,37 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) -#include <linux/tcp.h> +struct inet6_timewait_sock { + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; struct tcp6_timewait_sock { - struct tcp_timewait_sock tw_v6_sk; - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; + struct tcp_timewait_sock tcp6tw_tcp; + struct inet6_timewait_sock tcp6tw_inet6; }; -static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) +static inline u16 inet6_tw_offset(const struct proto *prot) { - return (struct tcp6_timewait_sock *)sk; + return prot->twsk_prot->twsk_obj_size - + sizeof(struct inet6_timewait_sock); } -static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) +{ + return (struct inet6_timewait_sock *)(((u8 *)sk) + + inet_twsk(sk)->tw_ipv6_offset); +} + +static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk) { return likely(sk->sk_state != TCP_TIME_WAIT) ? - &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; + &inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr; } -static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { - return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; + return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) @@ -361,13 +394,19 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) return NULL; } +static inline struct inet6_request_sock * + inet6_rsk(const struct request_sock *rsk) +{ + return NULL; +} + static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } -#define __tcp_v6_rcv_saddr(__sk) NULL -#define tcp_v6_rcv_saddr(__sk) NULL +#define __inet6_rcv_saddr(__sk) NULL +#define inet6_rcv_saddr(__sk) NULL #define tcp_twsk_ipv6only(__sk) 0 #define inet_v6_ipv6only(__sk) 0 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ diff --git a/include/linux/net.h b/include/linux/net.h index d6a41e6577f6..28195a2d8ff0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -107,7 +107,7 @@ enum sock_type { struct socket { socket_state state; unsigned long flags; - struct proto_ops *ops; + const struct proto_ops *ops; struct fasync_struct *fasync_list; struct file *file; struct sock *sk; @@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \ (file, sock, vma)) \ \ -static struct proto_ops name##_ops = { \ +static const struct proto_ops name##_ops = { \ .family = fam, \ .owner = THIS_MODULE, \ .release = __lock_##name##_release, \ diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h index 724066778aff..6351c4055ace 100644 --- a/include/linux/pfkeyv2.h +++ b/include/linux/pfkeyv2.h @@ -216,6 +216,16 @@ struct sadb_x_nat_t_port { } __attribute__((packed)); /* sizeof(struct sadb_x_nat_t_port) == 8 */ +/* Generic LSM security context */ +struct sadb_x_sec_ctx { + uint16_t sadb_x_sec_len; + uint16_t sadb_x_sec_exttype; + uint8_t sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */ + uint8_t sadb_x_ctx_doi; + uint16_t sadb_x_ctx_len; +} __attribute__((packed)); +/* sizeof(struct sadb_sec_ctx) = 8 */ + /* Message types */ #define SADB_RESERVED 0 #define SADB_GETSPI 1 @@ -325,7 +335,8 @@ struct sadb_x_nat_t_port { #define SADB_X_EXT_NAT_T_SPORT 21 #define SADB_X_EXT_NAT_T_DPORT 22 #define SADB_X_EXT_NAT_T_OA 23 -#define SADB_EXT_MAX 23 +#define SADB_X_EXT_SEC_CTX 24 +#define SADB_EXT_MAX 24 /* Identity Extension values */ #define SADB_IDENTTYPE_RESERVED 0 diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index e87b233615b3..d10f35338507 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -429,6 +429,7 @@ enum TCA_NETEM_CORR, TCA_NETEM_DELAY_DIST, TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, __TCA_NETEM_MAX, }; @@ -457,6 +458,12 @@ struct tc_netem_reorder __u32 correlation; }; +struct tc_netem_corrupt +{ + __u32 probability; + __u32 correlation; +}; + #define NETEM_DIST_SCALE 8192 #endif diff --git a/include/linux/random.h b/include/linux/random.h index 7b2adb3322d5..5d6456bcdeba 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -52,9 +52,9 @@ extern void get_random_bytes(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); extern __u32 secure_ip_id(__u32 daddr); -extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); -extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, - __u16 dport); +extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); +extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, + __u16 dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, diff --git a/include/linux/security.h b/include/linux/security.h index f7e0ae018712..ef753654daa5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -59,6 +59,12 @@ struct sk_buff; struct sock; struct sockaddr; struct socket; +struct flowi; +struct dst_entry; +struct xfrm_selector; +struct xfrm_policy; +struct xfrm_state; +struct xfrm_user_sec_ctx; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb); @@ -788,6 +794,52 @@ struct swap_info_struct; * which is used to copy security attributes between local stream sockets. * @sk_free_security: * Deallocate security structure. + * @sk_getsid: + * Retrieve the LSM-specific sid for the sock to enable caching of network + * authorizations. + * + * Security hooks for XFRM operations. + * + * @xfrm_policy_alloc_security: + * @xp contains the xfrm_policy being added to Security Policy Database + * used by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level policy update program (e.g., setkey). + * Allocate a security structure to the xp->selector.security field. + * The security field is initialized to NULL when the xfrm_policy is + * allocated. + * Return 0 if operation was successful (memory to allocate, legal context) + * @xfrm_policy_clone_security: + * @old contains an existing xfrm_policy in the SPD. + * @new contains a new xfrm_policy being cloned from old. + * Allocate a security structure to the new->selector.security field + * that contains the information from the old->selector.security field. + * Return 0 if operation was successful (memory to allocate). + * @xfrm_policy_free_security: + * @xp contains the xfrm_policy + * Deallocate xp->selector.security. + * @xfrm_state_alloc_security: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level SA generation program (e.g., setkey or racoon). + * Allocate a security structure to the x->sel.security field. The + * security field is initialized to NULL when the xfrm_state is + * allocated. + * Return 0 if operation was successful (memory to allocate, legal context). + * @xfrm_state_free_security: + * @x contains the xfrm_state. + * Deallocate x>sel.security. + * @xfrm_policy_lookup: + * @xp contains the xfrm_policy for which the access control is being + * checked. + * @sk_sid contains the sock security label that is used to authorize + * access to the policy xp. + * @dir contains the direction of the flow (input or output). + * Check permission when a sock selects a xfrm_policy for processing + * XFRMs on a packet. The hook is called when selecting either a + * per-socket policy or a generic xfrm policy. + * Return 0 if permission is granted. * * Security hooks affecting all Key Management operations * @@ -1237,8 +1289,18 @@ struct security_operations { int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); void (*sk_free_security) (struct sock *sk); + unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir); #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM + int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx); + int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new); + void (*xfrm_policy_free_security) (struct xfrm_policy *xp); + int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx); + void (*xfrm_state_free_security) (struct xfrm_state *x); + int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir); +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + /* key management security hooks */ #ifdef CONFIG_KEYS int (*key_alloc)(struct key *key); @@ -2679,6 +2741,11 @@ static inline void security_sk_free(struct sock *sk) { return security_ops->sk_free_security(sk); } + +static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir) +{ + return security_ops->sk_getsid(sk, fl, dir); +} #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct socket * sock, struct socket * other, @@ -2795,8 +2862,73 @@ static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority) static inline void security_sk_free(struct sock *sk) { } + +static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir) +{ + return 0; +} #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM +static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx) +{ + return security_ops->xfrm_policy_alloc_security(xp, sec_ctx); +} + +static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) +{ + return security_ops->xfrm_policy_clone_security(old, new); +} + +static inline void security_xfrm_policy_free(struct xfrm_policy *xp) +{ + security_ops->xfrm_policy_free_security(xp); +} + +static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +{ + return security_ops->xfrm_state_alloc_security(x, sec_ctx); +} + +static inline void security_xfrm_state_free(struct xfrm_state *x) +{ + security_ops->xfrm_state_free_security(x); +} + +static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + return security_ops->xfrm_policy_lookup(xp, sk_sid, dir); +} +#else /* CONFIG_SECURITY_NETWORK_XFRM */ +static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) +{ + return 0; +} + +static inline void security_xfrm_policy_free(struct xfrm_policy *xp) +{ +} + +static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static inline void security_xfrm_state_free(struct xfrm_state *x) +{ +} + +static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + return 0; +} +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY static inline int security_key_alloc(struct key *key) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8c5d6001a923..483cfc47ec34 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -32,7 +32,6 @@ #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8) */ -#define SLAB_SKB /* Slabified skbuffs */ #define CHECKSUM_NONE 0 #define CHECKSUM_HW 1 @@ -134,7 +133,7 @@ struct skb_frag_struct { */ struct skb_shared_info { atomic_t dataref; - unsigned int nr_frags; + unsigned short nr_frags; unsigned short tso_size; unsigned short tso_segs; unsigned short ufo_size; @@ -1239,6 +1238,8 @@ extern int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb); +extern void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, + unsigned int flags); extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, int len, unsigned int csum); extern int skb_copy_bits(const struct sk_buff *skb, int offset, diff --git a/include/linux/socket.h b/include/linux/socket.h index 1739c2d5b95b..9f4019156fd8 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -27,7 +27,6 @@ struct __kernel_sockaddr_storage { #include <linux/compiler.h> /* __user */ extern int sysctl_somaxconn; -extern void sock_init(void); #ifdef CONFIG_PROC_FS struct seq_file; extern void socket_seq_show(struct seq_file *seq); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4be34ef8c2f7..93fa765e47d3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -390,6 +390,7 @@ enum NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, + NET_IPV4_IPFRAG_MAX_DIST=112, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0e1da6602e05..f2bb2396853f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -55,22 +55,6 @@ struct tcphdr { __u16 urg_ptr; }; -#define TCP_ACTION_FIN (1 << 7) - -enum { - TCPF_ESTABLISHED = (1 << 1), - TCPF_SYN_SENT = (1 << 2), - TCPF_SYN_RECV = (1 << 3), - TCPF_FIN_WAIT1 = (1 << 4), - TCPF_FIN_WAIT2 = (1 << 5), - TCPF_TIME_WAIT = (1 << 6), - TCPF_CLOSE = (1 << 7), - TCPF_CLOSE_WAIT = (1 << 8), - TCPF_LAST_ACK = (1 << 9), - TCPF_LISTEN = (1 << 10), - TCPF_CLOSING = (1 << 11) -}; - /* * The union cast uses a gcc extension to avoid aliasing problems * (union is compatible to any of its members) @@ -254,10 +238,9 @@ struct tcp_sock { __u32 snd_wl1; /* Sequence for window update */ __u32 snd_wnd; /* The window we expect to receive */ __u32 max_window; /* Maximal window ever seen from peer */ - __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u16 xmit_size_goal; /* Goal for segmenting output packets */ - __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ + /* XXX Two bytes hole, try to pack */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ @@ -295,8 +278,6 @@ struct tcp_sock { struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ - struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ - __u32 rcv_wnd; /* Current receiver window */ __u32 rcv_wup; /* rcv_nxt on last window update sent */ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/linux/udp.h b/include/linux/udp.h index b60e0b4a25c4..85a55658831c 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -35,10 +35,10 @@ struct udphdr { #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */ #ifdef __KERNEL__ - #include <linux/config.h> -#include <net/sock.h> -#include <linux/ip.h> +#include <linux/types.h> + +#include <net/inet_sock.h> struct udp_sock { /* inet_sock has to be the first member */ diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 0fb077d68441..82fbb758e28f 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -27,6 +27,22 @@ struct xfrm_id __u8 proto; }; +struct xfrm_sec_ctx { + __u8 ctx_doi; + __u8 ctx_alg; + __u16 ctx_len; + __u32 ctx_sid; + char ctx_str[0]; +}; + +/* Security Context Domains of Interpretation */ +#define XFRM_SC_DOI_RESERVED 0 +#define XFRM_SC_DOI_LSM 1 + +/* Security Context Algorithms */ +#define XFRM_SC_ALG_RESERVED 0 +#define XFRM_SC_ALG_SELINUX 1 + /* Selector, used as selector both on policy rules (SPD) and SAs. */ struct xfrm_selector @@ -146,6 +162,18 @@ enum { #define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE) +/* + * Generic LSM security context for comunicating to user space + * NOTE: Same format as sadb_x_sec_ctx + */ +struct xfrm_user_sec_ctx { + __u16 len; + __u16 exttype; + __u8 ctx_alg; /* LSMs: e.g., selinux == 1 */ + __u8 ctx_doi; + __u16 ctx_len; +}; + struct xfrm_user_tmpl { struct xfrm_id id; __u16 family; @@ -176,6 +204,7 @@ enum xfrm_attr_type_t { XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */ XFRMA_SA, XFRMA_POLICY, + XFRMA_SEC_CTX, /* struct xfrm_sec_ctx */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b5d785ab4a0e..bfc1779fc753 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -13,7 +13,7 @@ extern void unix_gc(void); #define UNIX_HASH_SIZE 256 extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -extern rwlock_t unix_table_lock; +extern spinlock_t unix_table_lock; extern atomic_t unix_tot_inflight; @@ -58,10 +58,10 @@ struct unix_skb_parms { #define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb)) #define UNIXCREDS(skb) (&UNIXCB((skb)).creds) -#define unix_state_rlock(s) read_lock(&unix_sk(s)->lock) -#define unix_state_runlock(s) read_unlock(&unix_sk(s)->lock) -#define unix_state_wlock(s) write_lock(&unix_sk(s)->lock) -#define unix_state_wunlock(s) write_unlock(&unix_sk(s)->lock) +#define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock) +#define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock) #ifdef __KERNEL__ /* The AF_UNIX socket */ @@ -76,7 +76,7 @@ struct unix_sock { struct sock *other; struct sock *gc_tree; atomic_t inflight; - rwlock_t lock; + spinlock_t lock; wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) diff --git a/include/net/atmclip.h b/include/net/atmclip.h index 47048b1d179a..90fcc98e676f 100644 --- a/include/net/atmclip.h +++ b/include/net/atmclip.h @@ -7,7 +7,6 @@ #define _ATMCLIP_H #include <linux/netdevice.h> -#include <linux/skbuff.h> #include <linux/atm.h> #include <linux/atmdev.h> #include <linux/atmarp.h> @@ -18,6 +17,7 @@ #define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back)) #define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key) +struct sk_buff; struct clip_vcc { struct atm_vcc *vcc; /* VCC descriptor */ diff --git a/include/net/dst.h b/include/net/dst.h index 6c196a5baf24..bee8b84d329d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -9,6 +9,7 @@ #define _NET_DST_H #include <linux/config.h> +#include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/jiffies.h> diff --git a/include/net/flow.h b/include/net/flow.h index 9a5c94b1a0ec..ec7eb86eb203 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -84,11 +84,12 @@ struct flowi { #define FLOW_DIR_OUT 1 #define FLOW_DIR_FWD 2 -typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, +struct sock; +typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp); -extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver); +extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, + flow_resolve_t resolver); extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 52d8b1a73d52..c5b96b2b8155 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -60,7 +60,7 @@ struct genl_info */ struct genl_ops { - unsigned int cmd; + u8 cmd; unsigned int flags; struct nla_policy *policy; int (*doit)(struct sk_buff *skb, diff --git a/include/net/icmp.h b/include/net/icmp.h index 6cdebeee5f96..e7c3f20fbafc 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -20,12 +20,9 @@ #include <linux/config.h> #include <linux/icmp.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/protocol.h> +#include <net/inet_sock.h> #include <net/snmp.h> -#include <linux/ip.h> struct icmp_err { int errno; @@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); #define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) #define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) +struct dst_entry; +struct net_proto_family; +struct sk_buff; + extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info); extern int icmp_rcv(struct sk_buff *skb); extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg); diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h index 225fc751d464..03b766afdc39 100644 --- a/include/net/ieee80211_crypt.h +++ b/include/net/ieee80211_crypt.h @@ -23,12 +23,17 @@ #ifndef IEEE80211_CRYPT_H #define IEEE80211_CRYPT_H -#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/list.h> +#include <asm/atomic.h> enum { IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), }; +struct sk_buff; +struct module; + struct ieee80211_crypto_ops { const char *name; struct list_head list; @@ -87,6 +92,8 @@ struct ieee80211_crypt_data { atomic_t refcnt; }; +struct ieee80211_device; + int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops); int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops); struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name); diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h new file mode 100644 index 000000000000..b33b438bffcc --- /dev/null +++ b/include/net/inet6_connection_sock.h @@ -0,0 +1,42 @@ +/* + * NET Generic infrastructure for INET6 connection oriented protocols. + * + * Authors: Many people, see the TCPv6 sources + * + * From code originally in TCPv6 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET6_CONNECTION_SOCK_H +#define _INET6_CONNECTION_SOCK_H + +#include <linux/types.h> + +struct in6_addr; +struct inet_bind_bucket; +struct request_sock; +struct sk_buff; +struct sock; +struct sockaddr; + +extern int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb); + +extern struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif); + +extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout); + +extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); + +extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok); +#endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 5a2beed5a770..25f708ff020e 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,6 +48,32 @@ static inline int inet6_sk_ehashfn(const struct sock *sk) return inet6_ehashfn(laddr, lport, faddr, fport); } +static inline void __inet6_hash(struct inet_hashinfo *hashinfo, + struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + inet_listen_wlock(hashinfo); + } else { + unsigned int hash; + sk->sk_hash = hash = inet6_sk_ehashfn(sk); + hash &= (hashinfo->ehash_size - 1); + list = &hashinfo->ehash[hash].chain; + lock = &hashinfo->ehash[hash].lock; + write_lock(lock); + } + + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); +} + /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM @@ -84,10 +110,10 @@ static inline struct sock * if(*((__u32 *)&(tw->tw_dport)) == ports && sk->sk_family == PF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); - if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } diff --git a/include/net/inet_common.h b/include/net/inet_common.h index f943306ce5ff..227adcbdfec8 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -1,8 +1,8 @@ #ifndef _INET_COMMON_H #define _INET_COMMON_H -extern struct proto_ops inet_stream_ops; -extern struct proto_ops inet_dgram_ops; +extern const struct proto_ops inet_stream_ops; +extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b0c99060b78d..50234fa56a68 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -15,9 +15,11 @@ #ifndef _INET_CONNECTION_SOCK_H #define _INET_CONNECTION_SOCK_H -#include <linux/ip.h> +#include <linux/compiler.h> #include <linux/string.h> #include <linux/timer.h> + +#include <net/inet_sock.h> #include <net/request_sock.h> #define INET_CSK_DEBUG 1 @@ -29,6 +31,29 @@ struct inet_bind_bucket; struct inet_hashinfo; struct tcp_congestion_ops; +/* + * Pointers to address related TCP functions + * (i.e. things that depend on the address family) + */ +struct inet_connection_sock_af_ops { + int (*queue_xmit)(struct sk_buff *skb, int ipfragok); + void (*send_check)(struct sock *sk, int len, + struct sk_buff *skb); + int (*rebuild_header)(struct sock *sk); + int (*conn_request)(struct sock *sk, struct sk_buff *skb); + struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); + int (*remember_stamp)(struct sock *sk); + __u16 net_header_len; + int (*setsockopt)(struct sock *sk, int level, int optname, + char __user *optval, int optlen); + int (*getsockopt)(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); + void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); + int sockaddr_len; +}; + /** inet_connection_sock - INET connection oriented sock * * @icsk_accept_queue: FIFO of established children @@ -36,13 +61,16 @@ struct tcp_congestion_ops; * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout + * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook + * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries * @icsk_probes_out: unanswered 0 window probes + * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data */ struct inet_connection_sock { @@ -54,14 +82,17 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_pmtu_cookie; struct tcp_congestion_ops *icsk_ca_ops; + struct inet_connection_sock_af_ops *icsk_af_ops; + unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state; __u8 icsk_retransmits; __u8 icsk_pending; __u8 icsk_backoff; __u8 icsk_syn_retries; __u8 icsk_probes_out; - /* 2 BYTES HOLE, TRY TO PACK! */ + __u16 icsk_ext_hdr_len; struct { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ @@ -192,8 +223,12 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk, const __u16 rport, const __u32 raddr, const __u32 laddr); +extern int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb); extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum); + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)); extern struct dst_entry* inet_csk_route_req(struct sock *sk, const struct request_sock *req); @@ -207,7 +242,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout); + unsigned long timeout); static inline void inet_csk_reqsk_queue_removed(struct sock *sk, struct request_sock *req) @@ -273,4 +308,6 @@ static inline unsigned int inet_csk_listen_poll(const struct sock *sk) extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); extern void inet_csk_listen_stop(struct sock *sk); +extern void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index b0c47e2eccf1..d599c6bfbb86 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -3,6 +3,8 @@ #include <linux/ip.h> #include <linux/skbuff.h> + +#include <net/inet_sock.h> #include <net/dsfield.h> enum { diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 07840baa9341..135d80fd658e 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -26,6 +26,7 @@ #include <linux/wait.h> #include <net/inet_connection_sock.h> +#include <net/inet_sock.h> #include <net/route.h> #include <net/sock.h> #include <net/tcp_states.h> @@ -128,26 +129,6 @@ struct inet_hashinfo { kmem_cache_t *bind_bucket_cachep; }; -static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, - const __u32 faddr, const __u16 fport) -{ - unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h; -} - -static inline int inet_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const __u32 laddr = inet->rcv_saddr; - const __u16 lport = inet->num; - const __u32 faddr = inet->daddr; - const __u16 fport = inet->dport; - - return inet_ehashfn(laddr, lport, faddr, fport); -} - static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) @@ -434,4 +415,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, return sk; } + +extern int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); #endif /* _INET_HASHTABLES_H */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h new file mode 100644 index 000000000000..883eb529ef8e --- /dev/null +++ b/include/net/inet_sock.h @@ -0,0 +1,193 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for inet_sock + * + * Authors: Many, reorganised here by + * Arnaldo Carvalho de Melo <acme@mandriva.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_SOCK_H +#define _INET_SOCK_H + +#include <linux/config.h> + +#include <linux/string.h> +#include <linux/types.h> + +#include <net/flow.h> +#include <net/sock.h> +#include <net/request_sock.h> + +/** struct ip_options - IP Options + * + * @faddr - Saved first hop address + * @is_setbyuser - Set by setsockopt? + * @is_data - Options in __data, rather than skb + * @is_strictroute - Strict source route + * @srr_is_hit - Packet destination addr was our one + * @is_changed - IP checksum more not valid + * @rr_needaddr - Need to record addr of outgoing dev + * @ts_needtime - Need to record timestamp + * @ts_needaddr - Need to record addr of outgoing dev + */ +struct ip_options { + __u32 faddr; + unsigned char optlen; + unsigned char srr; + unsigned char rr; + unsigned char ts; + unsigned char is_setbyuser:1, + is_data:1, + is_strictroute:1, + srr_is_hit:1, + is_changed:1, + rr_needaddr:1, + ts_needtime:1, + ts_needaddr:1; + unsigned char router_alert; + unsigned char __pad1; + unsigned char __pad2; + unsigned char __data[0]; +}; + +#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) + +struct inet_request_sock { + struct request_sock req; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u16 inet6_rsk_offset; + /* 2 bytes hole, try to pack */ +#endif + u32 loc_addr; + u32 rmt_addr; + u16 rmt_port; + u16 snd_wscale : 4, + rcv_wscale : 4, + tstamp_ok : 1, + sack_ok : 1, + wscale_ok : 1, + ecn_ok : 1, + acked : 1; + struct ip_options *opt; +}; + +static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) +{ + return (struct inet_request_sock *)sk; +} + +struct ip_mc_socklist; +struct ipv6_pinfo; +struct rtable; + +/** struct inet_sock - representation of INET sockets + * + * @sk - ancestor class + * @pinet6 - pointer to IPv6 control block + * @daddr - Foreign IPv4 addr + * @rcv_saddr - Bound local IPv4 addr + * @dport - Destination port + * @num - Local port + * @saddr - Sending source + * @uc_ttl - Unicast TTL + * @sport - Source port + * @id - ID counter for DF pkts + * @tos - TOS + * @mc_ttl - Multicasting TTL + * @is_icsk - is this an inet_connection_sock? + * @mc_index - Multicast device index + * @mc_list - Group array + * @cork - info to build ip hdr on each ip frag while socket is corked + */ +struct inet_sock { + /* sk and pinet6 has to be the first two members of inet_sock */ + struct sock sk; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipv6_pinfo *pinet6; +#endif + /* Socket demultiplex comparisons on incoming packets. */ + __u32 daddr; + __u32 rcv_saddr; + __u16 dport; + __u16 num; + __u32 saddr; + __s16 uc_ttl; + __u16 cmsg_flags; + struct ip_options *opt; + __u16 sport; + __u16 id; + __u8 tos; + __u8 mc_ttl; + __u8 pmtudisc; + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1; + int mc_index; + __u32 mc_addr; + struct ip_mc_socklist *mc_list; + struct { + unsigned int flags; + unsigned int fragsize; + struct ip_options *opt; + struct rtable *rt; + int length; /* Total length of all frames */ + u32 addr; + struct flowi fl; + } cork; +}; + +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ + +static inline struct inet_sock *inet_sk(const struct sock *sk) +{ + return (struct inet_sock *)sk; +} + +static inline void __inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from, + const int ancestor_size) +{ + memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, + sk_from->sk_prot->obj_size - ancestor_size); +} +#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) +static inline void inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); +} +#endif + +extern int inet_sk_rebuild_header(struct sock *sk); + +static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport) +{ + unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h; +} + +static inline int inet_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport); +} + +#endif /* _INET_SOCK_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 28f7b2103505..1da294c47522 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -17,15 +17,16 @@ #include <linux/config.h> -#include <linux/ip.h> #include <linux/list.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <net/inet_sock.h> #include <net/sock.h> #include <net/tcp_states.h> +#include <net/timewait_sock.h> #include <asm/atomic.h> @@ -127,7 +128,8 @@ struct inet_timewait_sock { __u16 tw_num; /* And these are ours. */ __u8 tw_ipv6only:1; - /* 31 bits hole, try to pack */ + /* 15 bits hole, try to pack */ + __u16 tw_ipv6_offset; int tw_timeout; unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; @@ -199,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw) printk(KERN_DEBUG "%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif - kmem_cache_free(tw->tw_prot->twsk_slab, tw); + kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } } diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 7fda471002b6..0965515f40cf 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -25,6 +25,7 @@ struct inet_peer __u32 v4daddr; /* peer's address */ __u16 avl_height; __u16 ip_id_count; /* IP ID for the next packet */ + atomic_t rid; /* Frag reception counter */ __u32 tcp_ts; unsigned long tcp_ts_stamp; }; diff --git a/include/net/ip.h b/include/net/ip.h index e4563bbee6ea..f7e7fd728b67 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -24,14 +24,10 @@ #include <linux/config.h> #include <linux/types.h> -#include <linux/socket.h> #include <linux/ip.h> #include <linux/in.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/in_route.h> -#include <net/route.h> -#include <net/arp.h> + +#include <net/inet_sock.h> #include <net/snmp.h> struct sock; @@ -45,6 +41,7 @@ struct inet_skb_parm #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 #define IPSKB_XFRM_TUNNEL_SIZE 8 +#define IPSKB_FRAG_COMPLETE 16 }; struct ipcm_cookie @@ -74,6 +71,13 @@ extern rwlock_t ip_ra_lock; #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ +struct msghdr; +struct net_device; +struct packet_type; +struct rtable; +struct sk_buff; +struct sockaddr; + extern void ip_mc_dropsocket(struct sock *); extern void ip_mc_dropdevice(struct net_device *dev); extern int igmp_mc_proc_init(void); @@ -168,6 +172,7 @@ extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_time; extern int sysctl_ipfrag_secret_interval; +extern int sysctl_ipfrag_max_dist; /* From inetpeer.c */ extern int inet_peer_threshold; @@ -182,6 +187,8 @@ extern int sysctl_ip_dynaddr; extern void ipfrag_init(void); #ifdef CONFIG_INET +#include <net/dst.h> + /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 14de4ebd1211..e000fa2cd5f6 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag); extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); +struct rtentry; + /* Exported by fib_semantics.c */ extern int ip_fib_check_default(u32 gw, struct net_device *dev); extern int fib_sync_down(u32 local, struct net_device *dev, int force); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3b5559a023a4..7d2674fde19a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -251,16 +251,15 @@ struct ip_vs_daemon_user { #include <linux/config.h> #include <linux/list.h> /* for struct list_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ -#include <linux/skbuff.h> /* for struct sk_buff */ -#include <linux/ip.h> /* for struct iphdr */ #include <asm/atomic.h> /* for struct atomic_t */ -#include <linux/netdevice.h> /* for struct neighbour */ -#include <net/dst.h> /* for struct dst_entry */ -#include <net/udp.h> #include <linux/compiler.h> +#include <linux/timer.h> +#include <net/checksum.h> #ifdef CONFIG_IP_VS_DEBUG +#include <linux/net.h> + extern int ip_vs_get_debug_level(void); #define IP_VS_DBG(level, msg...) \ do { \ @@ -429,8 +428,11 @@ struct ip_vs_stats spinlock_t lock; /* spin lock */ }; +struct dst_entry; +struct iphdr; struct ip_vs_conn; struct ip_vs_app; +struct sk_buff; struct ip_vs_protocol { struct ip_vs_protocol *next; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0a2ad51cff82..860bbac4c4ee 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -240,6 +240,8 @@ extern struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_t struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); +extern int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb); + extern int ip6_frag_nqueues; extern atomic_t ip6_frag_mem; @@ -525,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr, extern int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +extern int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); + /* * reassembly.c */ @@ -533,8 +538,11 @@ extern int sysctl_ip6frag_low_thresh; extern int sysctl_ip6frag_time; extern int sysctl_ip6frag_secret_interval; -extern struct proto_ops inet6_stream_ops; -extern struct proto_ops inet6_dgram_ops; +extern const struct proto_ops inet6_stream_ops; +extern const struct proto_ops inet6_dgram_ops; + +struct group_source_req; +struct group_filter; extern int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index f85d6e4b7442..bbac87eeb422 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -35,11 +35,20 @@ enum { #ifdef __KERNEL__ -#include <linux/skbuff.h> -#include <linux/netdevice.h> +#include <linux/config.h> +#include <linux/compiler.h> #include <linux/icmpv6.h> +#include <linux/in6.h> +#include <linux/types.h> + #include <net/neighbour.h> -#include <asm/atomic.h> + +struct ctl_table; +struct file; +struct inet6_dev; +struct net_device; +struct net_proto_family; +struct sk_buff; extern struct neigh_table nd_tbl; @@ -108,7 +117,7 @@ extern int igmp6_event_report(struct sk_buff *skb); extern void igmp6_cleanup(void); #ifdef CONFIG_SYSCTL -extern int ndisc_ifinfo_sysctl_change(ctl_table *ctl, +extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 34c07731933d..6fa9ae190741 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -49,8 +49,8 @@ #ifdef __KERNEL__ #include <asm/atomic.h> -#include <linux/skbuff.h> #include <linux/netdevice.h> +#include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h index bd08964b72c0..b225d8472b7e 100644 --- a/include/net/pkt_act.h +++ b/include/net/pkt_act.h @@ -15,7 +15,6 @@ #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> diff --git a/include/net/protocol.h b/include/net/protocol.h index 357691f6a45f..63f7db99c2a6 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -65,7 +65,7 @@ struct inet_protosw { int protocol; /* This is the L4 protocol number. */ struct proto *prot; - struct proto_ops *ops; + const struct proto_ops *ops; int capability; /* Which (if any) capability do * we need to use this socket @@ -76,6 +76,7 @@ struct inet_protosw { }; #define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */ #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ +#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ extern struct net_protocol *inet_protocol_base; extern struct net_protocol *inet_protos[MAX_INET_PROTOS]; diff --git a/include/net/raw.h b/include/net/raw.h index f47917469b12..e67b28a0248c 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -19,6 +19,8 @@ #include <linux/config.h> +#include <net/protocol.h> + extern struct proto raw_prot; extern void raw_err(struct sock *, struct sk_buff *, u32 info); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b52cc52ffe39..11641c9384f7 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -244,7 +244,7 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, u32 hash, struct request_sock *req, - unsigned timeout) + unsigned long timeout) { struct listen_sock *lopt = queue->listen_opt; diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8e7794ee27ff..f5c22d77feab 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -277,6 +277,24 @@ struct sctp_sock { __u32 default_context; __u32 default_timetolive; + /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to + * the destination address every heartbeat interval. This value + * will be inherited by all new associations. + */ + __u32 hbinterval; + + /* This is the max_retrans value for new associations. */ + __u16 pathmaxrxt; + + /* The initial Path MTU to use for new associations. */ + __u32 pathmtu; + + /* The default SACK delay timeout for new associations. */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; + struct sctp_initmsg initmsg; struct sctp_rtoinfo rtoinfo; struct sctp_paddrparams paddrparam; @@ -845,9 +863,6 @@ struct sctp_transport { /* Data that has been sent, but not acknowledged. */ __u32 flight_size; - /* PMTU : The current known path MTU. */ - __u32 pmtu; - /* Destination */ struct dst_entry *dst; /* Source address. */ @@ -862,7 +877,22 @@ struct sctp_transport { /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. */ - int hb_interval; + __u32 hbinterval; + + /* This is the max_retrans value for the transport and will + * be initialized from the assocs value. This can be changed + * using SCTP_SET_PEER_ADDR_PARAMS socket option. + */ + __u16 pathmaxrxt; + + /* PMTU : The current known path MTU. */ + __u32 pathmtu; + + /* SACK delay timeout */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; /* When was the last time (in jiffies) that we heard from this * transport? We use this to pick new active and retran paths. @@ -882,22 +912,11 @@ struct sctp_transport { */ int state; - /* hb_allowed : The current heartbeat state of this destination, - * : i.e. ALLOW-HB, NO-HEARTBEAT, etc. - */ - int hb_allowed; - /* These are the error stats for this destination. */ /* Error count : The current error count for this destination. */ unsigned short error_count; - /* This is the max_retrans value for the transport and will - * be initialized to proto.max_retrans.path. This can be changed - * using SCTP_SET_PEER_ADDR_PARAMS socket option. - */ - int max_retrans; - /* Per : A timer used by each destination. * Destination : * Timer : @@ -1502,6 +1521,28 @@ struct sctp_association { /* The largest timeout or RTO value to use in attempting an INIT */ __u16 max_init_timeo; + /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to + * the destination address every heartbeat interval. This value + * will be inherited by all new transports. + */ + __u32 hbinterval; + + /* This is the max_retrans value for new transports in the + * association. + */ + __u16 pathmaxrxt; + + /* Association : The smallest PMTU discovered for all of the + * PMTU : peer's transport addresses. + */ + __u32 pathmtu; + + /* SACK delay timeout */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; + int timeouts[SCTP_NUM_TIMEOUT_TYPES]; struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES]; @@ -1571,11 +1612,6 @@ struct sctp_association { */ wait_queue_head_t wait; - /* Association : The smallest PMTU discovered for all of the - * PMTU : peer's transport addresses. - */ - __u32 pmtu; - /* The message size at which SCTP fragmentation will occur. */ __u32 frag_point; diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index f1c3bc54526a..8a6bef6f91eb 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -93,6 +93,8 @@ enum sctp_optname { #define SCTP_STATUS SCTP_STATUS SCTP_GET_PEER_ADDR_INFO, #define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO + SCTP_DELAYED_ACK_TIME, +#define SCTP_DELAYED_ACK_TIME SCTP_DELAYED_ACK_TIME /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -503,13 +505,41 @@ struct sctp_setadaption { * unreachable. The following structure is used to access and modify an * address's parameters: */ +enum sctp_spp_flags { + SPP_HB_ENABLE = 1, /*Enable heartbeats*/ + SPP_HB_DISABLE = 2, /*Disable heartbeats*/ + SPP_HB = SPP_HB_ENABLE | SPP_HB_DISABLE, + SPP_HB_DEMAND = 4, /*Send heartbeat immediately*/ + SPP_PMTUD_ENABLE = 8, /*Enable PMTU discovery*/ + SPP_PMTUD_DISABLE = 16, /*Disable PMTU discovery*/ + SPP_PMTUD = SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE, + SPP_SACKDELAY_ENABLE = 32, /*Enable SACK*/ + SPP_SACKDELAY_DISABLE = 64, /*Disable SACK*/ + SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE, +}; + struct sctp_paddrparams { sctp_assoc_t spp_assoc_id; struct sockaddr_storage spp_address; __u32 spp_hbinterval; __u16 spp_pathmaxrxt; + __u32 spp_pathmtu; + __u32 spp_sackdelay; + __u32 spp_flags; } __attribute__((packed, aligned(4))); +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + */ +struct sctp_assoc_value { + sctp_assoc_t assoc_id; + uint32_t assoc_value; +}; + /* * 7.2.2 Peer Address Information * diff --git a/include/net/sock.h b/include/net/sock.h index 982b4ecd187b..6961700ff3a0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk); extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; +struct timewait_sock_ops; /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface @@ -557,11 +558,10 @@ struct proto { kmem_cache_t *slab; unsigned int obj_size; - kmem_cache_t *twsk_slab; - unsigned int twsk_obj_size; atomic_t *orphan_count; struct request_sock_ops *rsk_prot; + struct timewait_sock_ops *twsk_prot; struct module *owner; @@ -926,6 +926,29 @@ static inline void sock_put(struct sock *sk) sk_free(sk); } +static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + rc = sk->sk_backlog_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); +out: + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} + /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, @@ -1166,7 +1189,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) static inline int sock_error(struct sock *sk) { - int err = xchg(&sk->sk_err, 0); + int err; + if (likely(!sk->sk_err)) + return 0; + err = xchg(&sk->sk_err, 0); return -err; } diff --git a/include/net/tcp.h b/include/net/tcp.h index d78025f9fbea..77f21c65bbca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -225,53 +225,6 @@ extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; /* - * Pointers to address related TCP functions - * (i.e. things that depend on the address family) - */ - -struct tcp_func { - int (*queue_xmit) (struct sk_buff *skb, - int ipfragok); - - void (*send_check) (struct sock *sk, - struct tcphdr *th, - int len, - struct sk_buff *skb); - - int (*rebuild_header) (struct sock *sk); - - int (*conn_request) (struct sock *sk, - struct sk_buff *skb); - - struct sock * (*syn_recv_sock) (struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst); - - int (*remember_stamp) (struct sock *sk); - - __u16 net_header_len; - - int (*setsockopt) (struct sock *sk, - int level, - int optname, - char __user *optval, - int optlen); - - int (*getsockopt) (struct sock *sk, - int level, - int optname, - char __user *optval, - int __user *optlen); - - - void (*addr2sockaddr) (struct sock *sk, - struct sockaddr *); - - int sockaddr_len; -}; - -/* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ @@ -334,6 +287,9 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); +extern int tcp_twsk_unique(struct sock *sk, + struct sock *sktw, void *twp); + static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { @@ -405,8 +361,7 @@ extern void tcp_parse_options(struct sk_buff *skb, * TCP v4 functions exported for the inet6 API */ -extern void tcp_v4_send_check(struct sock *sk, - struct tcphdr *th, int len, +extern void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); extern int tcp_v4_conn_request(struct sock *sk, @@ -490,34 +445,16 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -/* Initialize RCV_MSS value. - * RCV_MSS is an our guess about MSS used by the peer. - * We haven't any direct information about the MSS. - * It's better to underestimate the RCV_MSS rather than overestimate. - * Overestimations make us ACKing less frequently than needed. - * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). - */ +extern void tcp_initialize_rcv_mss(struct sock *sk); -static inline void tcp_initialize_rcv_mss(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); - - hint = min(hint, tp->rcv_wnd/2); - hint = min(hint, TCP_MIN_RCVMSS); - hint = max(hint, TCP_MIN_MSS); - - inet_csk(sk)->icsk_ack.rcv_mss = hint; -} - -static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } -static __inline__ void tcp_fast_path_on(struct tcp_sock *tp) +static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } @@ -535,7 +472,7 @@ static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp) * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ -static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp) +static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; @@ -707,6 +644,7 @@ extern void tcp_cleanup_congestion_control(struct sock *sk); extern int tcp_set_default_congestion_control(const char *name); extern void tcp_get_default_congestion_control(char *name); extern int tcp_set_congestion_control(struct sock *sk, const char *name); +extern void tcp_slow_start(struct tcp_sock *tp); extern struct tcp_congestion_ops tcp_init_congestion_ops; extern u32 tcp_reno_ssthresh(struct sock *sk); @@ -746,7 +684,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ -static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return (tp->packets_out - tp->left_out + tp->retrans_out); } @@ -766,33 +704,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) (tp->snd_cwnd >> 2))); } -/* - * Linear increase during slow start - */ -static inline void tcp_slow_start(struct tcp_sock *tp) -{ - if (sysctl_tcp_abc) { - /* RFC3465: Slow Start - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (tp->bytes_acked < tp->mss_cache) - return; - - /* We MAY increase by 2 if discovered delayed ack */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } - tp->bytes_acked = 0; - - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; -} - - static inline void tcp_sync_left_out(struct tcp_sock *tp) { if (tp->rx_opt.sack_ok && @@ -801,34 +712,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -/* Set slow start threshold and cwnd not falling to slow start */ -static inline void __tcp_enter_cwr(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - - tp->undo_marker = 0; - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - tp->snd_cwnd_stamp = tcp_time_stamp; - TCP_ECN_queue_cwr(tp); -} - -static inline void tcp_enter_cwr(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->prior_ssthresh = 0; - tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { - __tcp_enter_cwr(sk); - tcp_set_ca_state(sk, TCP_CA_CWR); - } -} - +extern void tcp_enter_cwr(struct sock *sk); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); /* Slow start with delack produces 3 packets of burst, so that @@ -860,14 +744,14 @@ static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) return left <= tcp_max_burst(tp); } -static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, - const struct sk_buff *skb) +static inline void tcp_minshall_update(struct tcp_sock *tp, int mss, + const struct sk_buff *skb) { if (skb->len < mss) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { const struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out && !icsk->icsk_pending) @@ -875,18 +759,18 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t icsk->icsk_rto, TCP_RTO_MAX); } -static __inline__ void tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp) +static inline void tcp_push_pending_frames(struct sock *sk, + struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; } -static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; } @@ -894,19 +778,19 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) /* * Calculate(/check) TCP checksum */ -static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, - unsigned long saddr, unsigned long daddr, - unsigned long base) +static inline u16 tcp_v4_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr, + unsigned long base) { return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); } -static __inline__ int __tcp_checksum_complete(struct sk_buff *skb) +static inline int __tcp_checksum_complete(struct sk_buff *skb) { return __skb_checksum_complete(skb); } -static __inline__ int tcp_checksum_complete(struct sk_buff *skb) +static inline int tcp_checksum_complete(struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete(skb); @@ -914,7 +798,7 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb) /* Prequeue for VJ style copy to user, combined with checksumming. */ -static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) +static inline void tcp_prequeue_init(struct tcp_sock *tp) { tp->ucopy.task = NULL; tp->ucopy.len = 0; @@ -930,7 +814,7 @@ static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) * * NOTE: is this not too big to inline? */ -static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) +static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -971,7 +855,7 @@ static const char *statename[]={ }; #endif -static __inline__ void tcp_set_state(struct sock *sk, int state) +static inline void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; @@ -1005,7 +889,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) #endif } -static __inline__ void tcp_done(struct sock *sk) +static inline void tcp_done(struct sock *sk) { tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); @@ -1018,81 +902,13 @@ static __inline__ void tcp_done(struct sock *sk) inet_csk_destroy_sock(sk); } -static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) +static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->eff_sacks = 0; rx_opt->num_sacks = 0; } -static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp) -{ - if (tp->rx_opt.tstamp_ok) { - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); - } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; - int this_sack; - - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK))); - for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { - *ptr++ = htonl(sp[this_sack].start_seq); - *ptr++ = htonl(sp[this_sack].end_seq); - } - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; - } - } -} - -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - */ -static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. - */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if(sack) - *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - else - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if(sack) - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (offer_wscale) - *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); -} - /* Determine a window scaling and initial window to offer. */ extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, @@ -1117,9 +933,9 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static __inline__ void tcp_openreq_init(struct request_sock *req, - struct tcp_options_received *rx_opt, - struct sk_buff *skb) +static inline void tcp_openreq_init(struct request_sock *req, + struct tcp_options_received *rx_opt, + struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index b9d4176b2d15..b0b645988bd8 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -31,4 +31,20 @@ enum { #define TCP_STATE_MASK 0xF +#define TCP_ACTION_FIN (1 << 7) + +enum { + TCPF_ESTABLISHED = (1 << 1), + TCPF_SYN_SENT = (1 << 2), + TCPF_SYN_RECV = (1 << 3), + TCPF_FIN_WAIT1 = (1 << 4), + TCPF_FIN_WAIT2 = (1 << 5), + TCPF_TIME_WAIT = (1 << 6), + TCPF_CLOSE = (1 << 7), + TCPF_CLOSE_WAIT = (1 << 8), + TCPF_LAST_ACK = (1 << 9), + TCPF_LISTEN = (1 << 10), + TCPF_CLOSING = (1 << 11) +}; + #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h new file mode 100644 index 000000000000..2544281e1d5e --- /dev/null +++ b/include/net/timewait_sock.h @@ -0,0 +1,31 @@ +/* + * NET Generic infrastructure for Network protocols. + * + * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _TIMEWAIT_SOCK_H +#define _TIMEWAIT_SOCK_H + +#include <linux/slab.h> +#include <net/sock.h> + +struct timewait_sock_ops { + kmem_cache_t *twsk_slab; + unsigned int twsk_obj_size; + int (*twsk_unique)(struct sock *sk, + struct sock *sktw, void *twp); +}; + +static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) +{ + if (sk->sk_prot->twsk_prot->twsk_unique != NULL) + return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); + return 0; +} + +#endif /* _TIMEWAIT_SOCK_H */ diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 4e86f2de6638..61f724c1036f 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -44,7 +44,7 @@ extern int datagram_send_ctl(struct msghdr *msg, /* * address family specific functions */ -extern struct tcp_func ipv4_specific; +extern struct inet_connection_sock_af_ops ipv4_specific; extern int inet6_destroy_sock(struct sock *sk); diff --git a/include/net/udp.h b/include/net/udp.h index 107b9d791a1f..766fba1369ce 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -22,9 +22,8 @@ #ifndef _UDP_H #define _UDP_H -#include <linux/udp.h> -#include <linux/ip.h> #include <linux/list.h> +#include <net/inet_sock.h> #include <net/sock.h> #include <net/snmp.h> #include <linux/seq_file.h> @@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num) extern struct proto udp_prot; +struct sk_buff; extern void udp_err(struct sk_buff *, u32); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1cdb87912137..07d7b50cdd76 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2,11 +2,12 @@ #define _NET_XFRM_H #include <linux/compiler.h> +#include <linux/in.h> #include <linux/xfrm.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> +#include <linux/socket.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/in6.h> @@ -144,6 +145,9 @@ struct xfrm_state * transformer. */ struct xfrm_type *type; + /* Security context */ + struct xfrm_sec_ctx *security; + /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; @@ -298,6 +302,7 @@ struct xfrm_policy __u8 flags; __u8 dead; __u8 xfrm_nr; + struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; }; @@ -510,6 +515,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl, return 0; } +#ifdef CONFIG_SECURITY_NETWORK_XFRM +/* If neither has a context --> match + * Otherwise, both must have a context and the sids, doi, alg must match + */ +static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) +{ + return ((!s1 && !s2) || + (s1 && s2 && + (s1->ctx_sid == s2->ctx_sid) && + (s1->ctx_doi == s2->ctx_doi) && + (s1->ctx_alg == s2->ctx_alg))); +} +#else +static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) +{ + return 1; +} +#endif + /* A struct encoding bundle of transformations to apply to some set of flow. * * dst->child points to the next element of bundle. @@ -878,8 +902,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete); +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete); void xfrm_policy_flush(void); u32 xfrm_get_acqseq(void); diff --git a/init/main.c b/init/main.c index 27f97f9b4636..54aaf561cf66 100644 --- a/init/main.c +++ b/init/main.c @@ -47,7 +47,6 @@ #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> -#include <net/sock.h> #include <asm/io.h> #include <asm/bugs.h> @@ -614,9 +613,6 @@ static void __init do_basic_setup(void) sysctl_init(); #endif - /* Networking initialization needs a process context */ - sock_init(); - do_initcalls(); } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 7982656b9c83..a5144e43aae1 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -63,7 +63,7 @@ #include <linux/atalk.h> struct datalink_proto *ddp_dl, *aarp_dl; -static struct proto_ops atalk_dgram_ops; +static const struct proto_ops atalk_dgram_ops; /**************************************************************************\ * * @@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr */ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - int rc = -EINVAL; + int rc = -ENOIOCTLCMD; struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; @@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = atif_ioctl(cmd, argp); rtnl_unlock(); break; - /* Physical layer ioctl calls */ - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCGIFFLAGS: - case SIOCSIFFLAGS: - case SIOCGIFTXQLEN: - case SIOCSIFTXQLEN: - case SIOCGIFMTU: - case SIOCGIFCONF: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCGIFCOUNT: - case SIOCGIFINDEX: - case SIOCGIFNAME: - rc = dev_ioctl(cmd, argp); - break; } return rc; @@ -1841,7 +1824,7 @@ static struct net_proto_family atalk_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { .family = PF_APPLETALK, .owner = THIS_MODULE, .release = atalk_release, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2684a92da22b..f2c541774dcd 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr, } -static struct proto_ops pvc_proto_ops = { +static const struct proto_ops pvc_proto_ops = { .family = PF_ATMPVC, .owner = THIS_MODULE, diff --git a/net/atm/svc.c b/net/atm/svc.c index d7b266136bf6..3a180cfd7b48 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return error; } -static struct proto_ops svc_proto_ops = { +static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1b683f302657..e8753c7fcad1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -54,7 +54,7 @@ HLIST_HEAD(ax25_list); DEFINE_SPINLOCK(ax25_list_lock); -static struct proto_ops ax25_proto_ops; +static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { @@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - res = dev_ioctl(cmd, argp); + res = -ENOIOCTLCMD; break; } release_sock(sk); @@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops ax25_proto_ops = { +static const struct proto_ops ax25_proto_ops = { .family = PF_AX25, .owner = THIS_MODULE, .release = ax25_release, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ea616e3fc98e..fb031fe9be9e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) timeo = schedule_timeout(timeo); lock_sock(sk); - if (sk->sk_err) { - err = sock_error(sk); + err = sock_error(sk); + if (err) break; - } } set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 9778c6acd53b..ccbaf69afc5b 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } -static struct proto_ops bnep_sock_ops = { +static const struct proto_ops bnep_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = bnep_sock_release, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index beb045bf5714..5e22343b6090 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops cmtp_sock_ops = { +static const struct proto_ops cmtp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = cmtp_sock_release, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d6d0a15c099..84e6c93a044a 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char return 0; } -static struct proto_ops hci_sock_ops = { +static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index f8986f881431..8f8dd931b294 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops hidp_sock_ops = { +static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index e3bb11ca4235..7f0781e4326f 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -57,7 +57,7 @@ #define VERSION "2.8" -static struct proto_ops l2cap_sock_ops; +static const struct proto_ops l2cap_sock_ops; static struct bt_sock_list l2cap_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -2160,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); -static struct proto_ops l2cap_sock_ops = { +static const struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = l2cap_sock_release, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 6c34261b232e..757d2dd3b02f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -58,7 +58,7 @@ #define BT_DBG(D...) #endif -static struct proto_ops rfcomm_sock_ops; +static const struct proto_ops rfcomm_sock_ops; static struct bt_sock_list rfcomm_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); -static struct proto_ops rfcomm_sock_ops = { +static const struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = rfcomm_sock_release, diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9cb00dc6c08c..6b61323ce23c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -56,7 +56,7 @@ #define VERSION "0.5" -static struct proto_ops sco_sock_ops; +static const struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); -static struct proto_ops sco_sock_ops = { +static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, diff --git a/net/bridge/br.c b/net/bridge/br.c index f8f184942aaf..188cc1ac49eb 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook); module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); +MODULE_VERSION(BR_VERSION); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f564ee99782d..0b33a7b3a00c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -15,7 +15,9 @@ #include <linux/kernel.h> #include <linux/netdevice.h> -#include <linux/module.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> + #include <asm/uaccess.h> #include "br_private.h" @@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) return 0; } +/* Allow setting mac address of pseudo-bridge to be same as + * any of the bound interfaces + */ +static int br_set_mac_address(struct net_device *dev, void *p) +{ + struct net_bridge *br = netdev_priv(dev); + struct sockaddr *addr = p; + struct net_bridge_port *port; + int err = -EADDRNOTAVAIL; + + spin_lock_bh(&br->lock); + list_for_each_entry(port, &br->port_list, list) { + if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) { + br_stp_change_bridge_id(br, addr->sa_data); + err = 0; + break; + } + } + spin_unlock_bh(&br->lock); + + return err; +} + +static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "bridge"); + strcpy(info->version, BR_VERSION); + strcpy(info->fw_version, "N/A"); + strcpy(info->bus_info, "N/A"); +} + +static int br_set_sg(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_SG; + else + br->feature_mask &= ~NETIF_F_SG; + + br_features_recompute(br); + return 0; +} + +static int br_set_tso(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_TSO; + else + br->feature_mask &= ~NETIF_F_TSO; + + br_features_recompute(br); + return 0; +} + +static int br_set_tx_csum(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_IP_CSUM; + else + br->feature_mask &= ~NETIF_F_IP_CSUM; + + br_features_recompute(br); + return 0; +} + +static struct ethtool_ops br_ethtool_ops = { + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_sg = ethtool_op_get_sg, + .set_sg = br_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = br_set_tx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = br_set_tso, +}; + void br_dev_setup(struct net_device *dev) { memset(dev->dev_addr, 0, ETH_ALEN); @@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev) dev->change_mtu = br_change_mtu; dev->destructor = free_netdev; SET_MODULE_OWNER(dev); + SET_ETHTOOL_OPS(dev, &br_ethtool_ops); dev->stop = br_dev_stop; dev->tx_queue_len = 0; - dev->set_mac_address = NULL; + dev->set_mac_address = br_set_mac_address; dev->priv_flags = IFF_EBRIDGE; + + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 975abe254b7a..11321197338e 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -32,9 +32,8 @@ * ethtool, use ethtool_ops. Also, since driver might sleep need to * not be holding any locks. */ -static int br_initial_port_cost(struct net_device *dev) +static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; struct ifreq ifr; mm_segment_t old_fs; @@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev) return 2; case SPEED_10: return 100; - default: - pr_info("bridge: can't decode speed from %s: %d\n", - dev->name, ecmd.speed); - return 100; } } @@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev) return 100; /* assume old 10Mbps */ } + +/* + * Check for port carrier transistions. + * Called from work queue to allow for calling functions that + * might sleep (such as speed check), and to debounce. + */ +static void port_carrier_check(void *arg) +{ + struct net_bridge_port *p = arg; + + rtnl_lock(); + if (netif_carrier_ok(p->dev)) { + u32 cost = port_cost(p->dev); + + spin_lock_bh(&p->br->lock); + if (p->state == BR_STATE_DISABLED) { + p->path_cost = cost; + br_stp_enable_port(p); + } + spin_unlock_bh(&p->br->lock); + } else { + spin_lock_bh(&p->br->lock); + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + spin_unlock_bh(&p->br->lock); + } + rtnl_unlock(); +} + static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; @@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p) dev->br_port = NULL; dev_set_promiscuity(dev, -1); + cancel_delayed_work(&p->carrier_check); + flush_scheduled_work(); + spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); @@ -155,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name) br->bridge_id.prio[1] = 0x00; memset(br->bridge_id.addr, 0, ETH_ALEN); + br->feature_mask = dev->features; br->stp_enabled = 0; br->designated_root = br->bridge_id; br->root_path_cost = 0; @@ -195,10 +223,9 @@ static int find_portno(struct net_bridge *br) return (index >= BR_MAX_PORTS) ? -EXFULL : index; } -/* called with RTNL */ +/* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, - struct net_device *dev, - unsigned long cost) + struct net_device *dev) { int index; struct net_bridge_port *p; @@ -215,12 +242,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->br = br; dev_hold(dev); p->dev = dev; - p->path_cost = cost; + p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; dev->br_port = p; p->port_no = index; br_init_port(p); p->state = BR_STATE_DISABLED; + INIT_WORK(&p->carrier_check, port_carrier_check, p); kobject_init(&p->kobj); return p; @@ -322,9 +350,8 @@ void br_features_recompute(struct net_bridge *br) struct net_bridge_port *p; unsigned long features, checksum; - features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_HIGHDMA | NETIF_F_TSO; - checksum = NETIF_F_IP_CSUM; /* least commmon subset */ + features = br->feature_mask &~ NETIF_F_IP_CSUM; + checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { if (!(p->dev->features @@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (dev->br_port != NULL) return -EBUSY; - if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev)))) + if (IS_ERR(p = new_nbp(br, dev))) return PTR_ERR(p); if ((err = br_fdb_insert(br, p, dev->dev_addr))) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b88220a64cd8..c387852f753a 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + if (p->state == BR_STATE_LEARNING) { + kfree_skb(skb); + goto out; + } + if (br->dev->flags & IFF_PROMISC) { struct sk_buff *skb2; @@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto err; - if (p->state == BR_STATE_LEARNING) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source); - if (p->br->stp_enabled && !memcmp(dest, bridge_ula, 5) && !(dest[5] & 0xF0)) { @@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) NULL, br_stp_handle_bpdu); return 1; } + goto err; } - else if (p->state == BR_STATE_FORWARDING) { + if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) { if (br_should_route_hook) { if (br_should_route_hook(pskb)) return 0; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 23422bd53a5e..223f8270daee 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -26,6 +26,7 @@ #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/netfilter_bridge.h> @@ -33,8 +34,11 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> + #include <net/ip.h> #include <net/ipv6.h> +#include <net/route.h> + #include <asm/uaccess.h> #include <asm/checksum.h> #include "br_private.h" diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 917311c6828b..a43a9c1d50d7 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v br_stp_recalculate_bridge_id(br); break; - case NETDEV_CHANGE: /* device is up but carrier changed */ - if (!(br->dev->flags & IFF_UP)) - break; - - if (netif_carrier_ok(dev)) { - if (p->state == BR_STATE_DISABLED) - br_stp_enable_port(p); - } else { - if (p->state != BR_STATE_DISABLED) - br_stp_disable_port(p); - } + case NETDEV_CHANGE: + if (br->dev->flags & IFF_UP) + schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE); break; case NETDEV_FEAT_CHANGE: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index bdf95a74d8cd..c5bd631ffcd5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -27,6 +27,10 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_PORT_DEBOUNCE (HZ/10) + +#define BR_VERSION "2.1" + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -78,6 +82,7 @@ struct net_bridge_port struct timer_list hold_timer; struct timer_list message_age_timer; struct kobject kobj; + struct work_struct carrier_check; struct rcu_head rcu; }; @@ -90,6 +95,7 @@ struct net_bridge spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; struct list_head age_list; + unsigned long feature_mask; /* STP */ bridge_id designated_root; @@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); extern void br_stp_set_port_priority(struct net_bridge_port *p, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index ac09b6a23523..cc047f7fb6ef 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p) } /* called under bridge lock */ -static void br_stp_change_bridge_id(struct net_bridge *br, - const unsigned char *addr) +void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { unsigned char oldaddr[6]; struct net_bridge_port *p; @@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || - compare_ether_addr(p->dev->dev_addr, addr) < 0) + memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index c70b3be23026..b84fc6075fe1 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. config BRIDGE_EBT_ULOG - tristate "ebt: ulog support" + tristate "ebt: ulog support (OBSOLETE)" depends on BRIDGE_NF_EBTABLES help + This option enables the old bridge-specific "ebt_ulog" implementation + which has been obsoleted by the new "nfnetlink_log" code (see + CONFIG_NETFILTER_NETLINK_LOG). + This option adds the ulog watcher, that you can use in any rule in any ebtables table. The packet is passed to a userspace logging daemon using netlink multicast sockets. This differs diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 662975be3d1d..9f6e0193ae10 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -3,13 +3,16 @@ * * Authors: * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> * * April, 2002 * */ +#include <linux/in.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_log.h> +#include <linux/netfilter.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/if_arp.h> @@ -55,27 +58,30 @@ static void print_MAC(unsigned char *p) } #define myNIPQUAD(a) a[0], a[1], a[2], a[3] -static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, - const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) +static void +ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) { - struct ebt_log_info *info = (struct ebt_log_info *)data; - char level_string[4] = "< >"; + unsigned int bitmask; - level_string[1] = '0' + info->loglevel; spin_lock_bh(&ebt_log_lock); - printk(level_string); - printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "", - out ? out->name : ""); + printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); - printk("MAC source = "); print_MAC(eth_hdr(skb)->h_source); printk("MAC dest = "); print_MAC(eth_hdr(skb)->h_dest); printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto)); - if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == + if (loginfo->type == NF_LOG_TYPE_LOG) + bitmask = loginfo->u.log.logflags; + else + bitmask = NF_LOG_MASK; + + if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)){ struct iphdr _iph, *ih; @@ -84,10 +90,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, printk(" INCOMPLETE IP header"); goto out; } - printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", - NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); - printk(" IP tos=0x%02X, IP proto=%d", ih->tos, - ih->protocol); + printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP " + "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), ih->tos, ih->protocol); if (ih->protocol == IPPROTO_TCP || ih->protocol == IPPROTO_UDP) { struct tcpudphdr _ports, *pptr; @@ -104,7 +109,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, goto out; } - if ((info->bitmask & EBT_LOG_ARP) && + if ((bitmask & EBT_LOG_ARP) && ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { struct arphdr _arph, *ah; @@ -144,6 +149,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, out: printk("\n"); spin_unlock_bh(&ebt_log_lock); + +} + +static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_log_info *info = (struct ebt_log_info *)data; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = info->loglevel; + li.u.log.logflags = info->bitmask; + + nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix); } static struct ebt_watcher log = @@ -154,13 +174,32 @@ static struct ebt_watcher log = .me = THIS_MODULE, }; +static struct nf_logger ebt_log_logger = { + .name = "ebt_log", + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { - return ebt_register_watcher(&log); + int ret; + + ret = ebt_register_watcher(&log); + if (ret < 0) + return ret; + if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) { + printk(KERN_WARNING "ebt_log: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + + return 0; } static void __exit fini(void) { + nf_log_unregister_logger(&ebt_log_logger); ebt_unregister_watcher(&log); } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index aae26ae2e61f..ce617b3dbbb8 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -3,6 +3,7 @@ * * Authors: * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> * * November, 2004 * @@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, +static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) + const struct ebt_ulog_info *uloginfo, const char *prefix) { ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; - struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ulog_buffers[group]; spinlock_t *lock = &ub->lock; @@ -216,6 +216,39 @@ alloc_failure: goto unlock; } +/* this function is registered with the netfilter core */ +static void ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *li, + const char *prefix) +{ + struct ebt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; + loginfo.cprange = 0; + loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nlgroup = li->u.ulog.group; + loginfo.cprange = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } + + ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; + + ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL); +} + + static int ebt_ulog_check(const char *tablename, unsigned int hookmask, const struct ebt_entry *e, void *data, unsigned int datalen) { @@ -240,6 +273,12 @@ static struct ebt_watcher ulog = { .me = THIS_MODULE, }; +static struct nf_logger ebt_ulog_logger = { + .name = EBT_ULOG_WATCHER, + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { int i, ret = 0; @@ -265,6 +304,13 @@ static int __init init(void) else if ((ret = ebt_register_watcher(&ulog))) sock_release(ebtulognl->sk_socket); + if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) { + printk(KERN_WARNING "ebt_ulog: not logging via ulog " + "since somebody else already registered for PF_BRIDGE\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + return ret; } @@ -273,6 +319,7 @@ static void __exit fini(void) ebt_ulog_buff_t *ub; int i; + nf_log_unregister_logger(&ebt_ulog_logger); ebt_unregister_watcher(&ulog); for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { ub = &ulog_buffers[i]; diff --git a/net/core/datagram.c b/net/core/datagram.c index 1bcfef51ac58..f8d322e1ea92 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -47,6 +47,7 @@ #include <linux/rtnetlink.h> #include <linux/poll.h> #include <linux/highmem.h> +#include <linux/spinlock.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } /** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + */ + +void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + if (flags & MSG_PEEK) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + kfree_skb(skb); +} + +EXPORT_SYMBOL(skb_kill_datagram); + +/** * skb_copy_datagram_iovec - Copy a datagram to an iovec. * @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/core/dev.c b/net/core/dev.c index a5efc9ae010b..29ba109d3e54 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); EXPORT_SYMBOL(dev_open); EXPORT_SYMBOL(dev_queue_xmit); EXPORT_SYMBOL(dev_remove_pack); diff --git a/net/core/filter.c b/net/core/filter.c index 3a10e0bc90e8..8964d3445588 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in sk_chk_filter() */ #include <linux/module.h> @@ -250,7 +251,7 @@ load_b: mem[fentry->k] = X; continue; default: - /* Invalid instruction counts as RET */ + WARN_ON(1); return 0; } @@ -283,8 +284,8 @@ load_b: * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain - * no references or jumps that are out of range, no illegal instructions - * and no backward jumps. It must end with a RET instruction + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. * * Returns 0 if the rule set is legal or a negative errno code if not. */ @@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen) for (pc = 0; pc < flen; pc++) { /* all jumps are forward as they are not signed */ ftest = &filter[pc]; - if (BPF_CLASS(ftest->code) == BPF_JMP) { - /* but they mustn't jump off the end */ - if (BPF_OP(ftest->code) == BPF_JA) { - /* - * Note, the large ftest->k might cause loops. - * Compare this with conditional jumps below, - * where offsets are limited. --ANK (981016) - */ - if (ftest->k >= (unsigned)(flen-pc-1)) - return -EINVAL; - } else { - /* for conditionals both must be safe */ - if (pc + ftest->jt +1 >= flen || - pc + ftest->jf +1 >= flen) - return -EINVAL; - } - } - /* check for division by zero -Kris Katterjohn 2005-10-30 */ - if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) - return -EINVAL; + /* Only allow valid instructions */ + switch (ftest->code) { + case BPF_ALU|BPF_ADD|BPF_K: + case BPF_ALU|BPF_ADD|BPF_X: + case BPF_ALU|BPF_SUB|BPF_K: + case BPF_ALU|BPF_SUB|BPF_X: + case BPF_ALU|BPF_MUL|BPF_K: + case BPF_ALU|BPF_MUL|BPF_X: + case BPF_ALU|BPF_DIV|BPF_X: + case BPF_ALU|BPF_AND|BPF_K: + case BPF_ALU|BPF_AND|BPF_X: + case BPF_ALU|BPF_OR|BPF_K: + case BPF_ALU|BPF_OR|BPF_X: + case BPF_ALU|BPF_LSH|BPF_K: + case BPF_ALU|BPF_LSH|BPF_X: + case BPF_ALU|BPF_RSH|BPF_K: + case BPF_ALU|BPF_RSH|BPF_X: + case BPF_ALU|BPF_NEG: + case BPF_LD|BPF_W|BPF_ABS: + case BPF_LD|BPF_H|BPF_ABS: + case BPF_LD|BPF_B|BPF_ABS: + case BPF_LD|BPF_W|BPF_LEN: + case BPF_LD|BPF_W|BPF_IND: + case BPF_LD|BPF_H|BPF_IND: + case BPF_LD|BPF_B|BPF_IND: + case BPF_LD|BPF_IMM: + case BPF_LDX|BPF_W|BPF_LEN: + case BPF_LDX|BPF_B|BPF_MSH: + case BPF_LDX|BPF_IMM: + case BPF_MISC|BPF_TAX: + case BPF_MISC|BPF_TXA: + case BPF_RET|BPF_K: + case BPF_RET|BPF_A: + break; + + /* Some instructions need special checks */ - /* check that memory operations use valid addresses. */ - if (ftest->k >= BPF_MEMWORDS) { - /* but it might not be a memory operation... */ - switch (ftest->code) { - case BPF_ST: - case BPF_STX: - case BPF_LD|BPF_MEM: - case BPF_LDX|BPF_MEM: + case BPF_ALU|BPF_DIV|BPF_K: + /* check for division by zero */ + if (ftest->k == 0) return -EINVAL; - } + break; + + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + case BPF_ST: + case BPF_STX: + /* check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + + case BPF_JMP|BPF_JA: + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + break; + + case BPF_JMP|BPF_JEQ|BPF_K: + case BPF_JMP|BPF_JEQ|BPF_X: + case BPF_JMP|BPF_JGE|BPF_K: + case BPF_JMP|BPF_JGE|BPF_X: + case BPF_JMP|BPF_JGT|BPF_K: + case BPF_JMP|BPF_JGT|BPF_X: + case BPF_JMP|BPF_JSET|BPF_K: + case BPF_JMP|BPF_JSET|BPF_X: + /* for conditionals both must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + + default: + return -EINVAL; } } diff --git a/net/core/flow.c b/net/core/flow.c index 7e95b39de9fd..c4f25385029f 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -23,6 +23,7 @@ #include <net/flow.h> #include <asm/atomic.h> #include <asm/semaphore.h> +#include <linux/security.h> struct flow_cache_entry { struct flow_cache_entry *next; @@ -30,6 +31,7 @@ struct flow_cache_entry { u8 dir; struct flowi key; u32 genid; + u32 sk_sid; void *object; atomic_t *object_ref; }; @@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, +void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, flow_resolve_t resolver) { struct flow_cache_entry *fle, **head; @@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && + fle->sk_sid == sk_sid && flow_key_compare(key, &fle->key) == 0) { if (fle->genid == atomic_read(&flow_cache_genid)) { void *ret = fle->object; @@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, *head = fle; fle->family = family; fle->dir = dir; + fle->sk_sid = sk_sid; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; flow_count(cpu)++; @@ -221,7 +225,7 @@ nocache: void *obj; atomic_t *obj_ref; - resolver(key, family, dir, &obj, &obj_ref); + resolver(key, sk_sid, family, dir, &obj, &obj_ref); if (fle) { fle->genid = atomic_read(&flow_cache_genid); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 49424a42a2c0..281a632fa6a6 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> +#include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7fc3e9e28c34..06cad2d63e8a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]); /* Module parameters, defaults. */ static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d = 0; -static int pg_clone_skb_d = 0; -static int debug = 0; +static int pg_delay_d; +static int pg_clone_skb_d; +static int debug; static DECLARE_MUTEX(pktgen_sem); static struct pktgen_thread *pktgen_threads = NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83fee37de38e..070f91cfde59 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int fclone) { + struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; /* Get the HEAD */ - if (fclone) - skb = kmem_cache_alloc(skbuff_fclone_cache, - gfp_mask & ~__GFP_DMA); - else - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); - + skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache, + gfp_mask & ~__GFP_DMA); if (!skb) goto out; @@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb->tail = data; skb->end = data + size; + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); + shinfo->nr_frags = 0; + shinfo->tso_size = 0; + shinfo->tso_segs = 0; + shinfo->ufo_size = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; + if (fclone) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, child->fclone = SKB_FCLONE_UNAVAILABLE; } - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; - skb_shinfo(skb)->frag_list = NULL; - skb_shinfo(skb)->ufo_size = 0; - skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: diff --git a/net/core/sock.c b/net/core/sock.c index 13cc3be4f056..6465b0e4c8cb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab) } } - if (prot->twsk_obj_size) { + if (prot->twsk_prot != NULL) { static const char mask[] = "tw_sock_%s"; timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); @@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab; sprintf(timewait_sock_slab_name, mask, prot->name); - prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, - prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (prot->twsk_slab == NULL) + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } @@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } - if (prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_slab); + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_slab); + kmem_cache_destroy(prot->twsk_prot->twsk_slab); kfree(name); - prot->twsk_slab = NULL; + prot->twsk_prot->twsk_slab = NULL; } } diff --git a/net/core/stream.c b/net/core/stream.c index 15bfd03e8024..35e25259fd95 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) int done; do { - if (sk->sk_err) - return sock_error(sk); + int err = sock_error(sk); + if (err) + return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) @@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, + !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); @@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && + sk_wait_event(sk, ¤t_timeo, !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 344a8da153fc..87b27fff6e3b 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,3 +1,7 @@ +obj-$(CONFIG_IPV6) += dccp_ipv6.o + +dccp_ipv6-y := ipv6.o + obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index c9a62cca22fc..ce9cb77c5c29 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) from = av->dccpav_buf + av->dccpav_buf_head; /* Check if buf_head wraps */ - if (av->dccpav_buf_head + len > av->dccpav_vec_len) { - const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head); + if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) { + const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head; memcpy(to, from, tailsize); to += tailsize; @@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len, const gfp_t priority) { - struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority); + struct dccp_ackvec *av; + BUG_ON(len == 0); + + if (len > DCCP_MAX_ACKVEC_LEN) + return NULL; + + av = kmalloc(sizeof(*av) + len, priority); if (av != NULL) { av->dccpav_buf_len = len; av->dccpav_buf_head = @@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av) } static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; } static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; } @@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, */ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, const unsigned int packets, - const unsigned char state) + const unsigned char state) { unsigned int gap; signed long new_head; @@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); - unsigned int index = av->dccpav_buf_head; + u8 index = av->dccpav_buf_head; while (1) { const u8 len = dccp_ackvec_len(av, index); @@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av) } #endif -static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) +static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av) { /* * As we're keeping track of the ack vector size (dccpav_vec_len) and @@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) * draft-ietf-dccp-spec-11.txt Appendix A. -acme */ #if 0 - av->dccpav_buf_tail = av->dccpav_ack_ptr + 1; - if (av->dccpav_buf_tail >= av->dccpav_vec_len) - av->dccpav_buf_tail -= av->dccpav_vec_len; + u32 new_buf_tail = av->dccpav_ack_ptr + 1; + if (new_buf_tail >= av->dccpav_vec_len) + new_buf_tail -= av->dccpav_vec_len; + av->dccpav_buf_tail = new_buf_tail; #endif av->dccpav_vec_len -= av->dccpav_sent_len; } @@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, debug_prefix, 1, (unsigned long long)av->dccpav_ack_seqno, (unsigned long long)av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1; } } @@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, av->dccpav_ack_seqno, (unsigned long long) av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); } /* * If dccpav_ack_seqno was not received, no problem diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index d0fd6c60c574..f7dfb5f67b87 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -54,16 +54,16 @@ * @dccpav_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - unsigned int dccpav_buf_head; - unsigned int dccpav_buf_tail; u64 dccpav_buf_ackno; u64 dccpav_ack_seqno; u64 dccpav_ack_ackno; - unsigned int dccpav_ack_ptr; - unsigned int dccpav_sent_len; - unsigned int dccpav_vec_len; - unsigned int dccpav_buf_len; struct timeval dccpav_time; + u8 dccpav_buf_head; + u8 dccpav_buf_tail; + u8 dccpav_ack_ptr; + u8 dccpav_sent_len; + u8 dccpav_vec_len; + u8 dccpav_buf_len; u8 dccpav_buf_nonce; u8 dccpav_ack_nonce; u8 dccpav_buf[0]; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c37eeeaf5c6e..de681c6ad081 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -21,6 +21,8 @@ #define CCID_MAX 255 +struct tcp_info; + struct ccid { unsigned char ccid_id; const char *ccid_name; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f97b85d55ad8..93f26dd6e6cb 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); #define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ -extern struct proto dccp_v4_prot; +extern struct proto dccp_prot; /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) @@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len); +extern int dccp_v4_init_sock(struct sock *sk); +extern int dccp_v4_destroy_sock(struct sock *sk); + extern void dccp_close(struct sock *sk, long timeout); extern struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, @@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk, extern int dccp_connect(struct sock *sk); extern int dccp_disconnect(struct sock *sk, int flags); +extern void dccp_unhash(struct sock *sk); extern int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); extern int dccp_setsockopt(struct sock *sk, int level, int optname, @@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void dccp_shutdown(struct sock *sk, int how); +extern int inet_dccp_listen(struct socket *sock, int backlog); +extern unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); +extern void dccp_v4_send_check(struct sock *sk, int len, + struct sk_buff *skb); +extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len); extern int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr); @@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb, extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); +extern int dccp_invalid_packet(struct sk_buff *skb); + +static inline int dccp_bad_service_code(const struct sock *sk, + const __u32 service) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_service == service) + return 0; + return !dccp_list_has_service(dp->dccps_service_list, service); +} struct dccp_skb_cb { __u8 dccpd_type:4; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index f675d8e642d3..3f78c00e3822 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = dp->dccps_pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; if (dp->dccps_options.dccpo_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; diff --git a/net/dccp/input.c b/net/dccp/input.c index 3454d5941900..b6cba72b44e8 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) return 0; } -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned len) +static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, skb)) - goto discard; - - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_options.dccpo_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto discard; - - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - switch (dccp_hdr(skb)->dccph_type) { case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: @@ -250,6 +233,37 @@ discard: return 0; } +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + return __dccp_rcv_established(sk, skb, dh, len); +discard: + __kfree_skb(skb); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rcv_established); + static int dccp_rcv_request_sent_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, @@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_update_gsr(sk, dp->dccps_isr); /* @@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) @@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dccp_set_state(sk, DCCP_PARTOPEN); /* Make sure socket is routed, for correct metrics. */ - inet_sk_rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, if (dh->dccph_type == DCCP_PKT_DATAACK || dh->dccph_type == DCCP_PKT_DATA) { - dccp_rcv_established(sk, skb, dh, len); + __dccp_rcv_established(sk, skb, dh, len); queued = 1; /* packet was queued - (by dccp_rcv_established) */ + (by __dccp_rcv_established) */ } break; } @@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (dccp_v4_conn_request(sk, skb) < 0) + if (inet_csk(sk)->icsk_af_ops->conn_request(sk, + skb) < 0) return 1; /* FIXME: do congestion control initialization */ @@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - if (dp->dccps_options.dccpo_send_ack_vector && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); } /* @@ -566,3 +587,5 @@ discard: } return 0; } + +EXPORT_SYMBOL_GPL(dccp_rcv_state_process); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 656e13e38cfb..3f244670764a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -19,7 +19,9 @@ #include <net/icmp.h> #include <net/inet_hashtables.h> +#include <net/inet_sock.h> #include <net/sock.h> +#include <net/timewait_sock.h> #include <net/tcp_states.h> #include <net/xfrm.h> @@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) { - return inet_csk_get_port(&dccp_hashinfo, sk, snum); + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void dccp_v4_hash(struct sock *sk) @@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk) inet_hash(&dccp_hashinfo, sk); } -static void dccp_v4_unhash(struct sock *sk) +void dccp_unhash(struct sock *sk) { inet_unhash(&dccp_hashinfo, sk); } -/* called with local bh disabled */ -static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const u32 daddr = inet->rcv_saddr; - const u32 saddr = inet->daddr; - const int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - const struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } +EXPORT_SYMBOL_GPL(dccp_unhash); - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp != NULL) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -/* - * Bind a port for a connect operation and hash it. - */ -static int dccp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - do { - head = &dccp_hashinfo.bhash[inet_bhashfn(rover, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == rover) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v4_check_established(sk, - rover, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, rover); - if (tb == NULL) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - /* All locks still held and bhs disabled */ - inet_bind_hash(sk, tb, rover); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(rover); - __inet_hash(&dccp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw != NULL) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { - __inet_hash(&dccp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - -static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) +int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); @@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin_port; inet->daddr = daddr; - dp->dccps_ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt != NULL) - dp->dccps_ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; /* * Socket identity is still unknown (sport may be zero). * However we set state to DCCP_REQUESTING and not releasing socket @@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, * complete initialization after this. */ dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v4_hash_connect(sk); + err = inet_hash_connect(&dccp_death_row, sk); if (err != 0) goto failure; @@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, usin->sin_port); dccp_update_gss(sk, dp->dccps_iss); - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - */ - dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); - inet->id = dp->dccps_iss ^ jiffies; err = dccp_connect(sk); @@ -316,6 +152,8 @@ failure: goto out; } +EXPORT_SYMBOL_GPL(dccp_v4_connect); + /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - dp->dccps_pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); /* @@ -606,6 +444,17 @@ out: sock_put(sk); } +/* This routine computes an IPv4 DCCP checksum. */ +void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + const struct inet_sock *inet = inet_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr); +} + +EXPORT_SYMBOL_GPL(dccp_v4_send_check); + int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; @@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk, dccp_hdr(skb)->dccph_sport); } -static inline int dccp_bad_service_code(const struct sock *sk, - const __u32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return 0; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) const __u32 service = dccp_hdr_request(skb)->dccph_req_service; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; - struct dst_entry *dst = NULL; /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ if (((struct rtable *)skb->dst)->rt_flags & @@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; - /* FIXME: Merge Aristeu's option parsing code when ready */ req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ ireq->opt = NULL; @@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); dreq->dreq_service = service; - if (dccp_v4_send_response(sk, req, dst)) + if (dccp_v4_send_response(sk, req, NULL)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: - /* - * FIXME: should be reqsk_free after implementing req->rsk_ops - */ - __reqsk_free(req); + reqsk_free(req); drop: DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); dcb->dccpd_reset_code = reset_code; return -1; } +EXPORT_SYMBOL_GPL(dccp_v4_conn_request); + /* * The three way handshake has completed - we got a valid ACK or DATAACK - * now create the new socket. @@ -792,6 +628,8 @@ exit: return NULL; } +EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); + static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); @@ -1011,7 +849,9 @@ discard: return 0; } -static inline int dccp_invalid_packet(struct sk_buff *skb) +EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); + +int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; @@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) return 1; } - /* If the header checksum is incorrect, drop packet and return */ - if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, - skb->nh.iph->daddr) < 0) { - LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is " - "incorrect\n"); - return 1; - } - return 0; } +EXPORT_SYMBOL_GPL(dccp_invalid_packet); + /* this is called when real data arrives */ int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; struct sock *sk; - int rc; /* Step 1: Check header basics: */ if (dccp_invalid_packet(skb)) goto discard_it; + /* If the header checksum is incorrect, drop packet and return */ + if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { + LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n", + __FUNCTION__); + goto discard_it; + } + dh = dccp_hdr(skb); DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); @@ -1143,28 +984,10 @@ int dccp_v4_rcv(struct sk_buff *skb) goto do_time_wait; } - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - dccp_pr_debug("xfrm4_policy_check failed\n"); + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - } - - if (sk_filter(sk, skb, 0)) { - dccp_pr_debug("sk_filter failed\n"); - goto discard_and_relse; - } - - skb->dev = NULL; - - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v4_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - return rc; + return sk_receive_skb(sk, skb); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -1194,9 +1017,23 @@ do_time_wait: goto no_dccp_socket; } -static int dccp_v4_init_sock(struct sock *sk) +struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v4_conn_request, + .syn_recv_sock = dccp_v4_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +}; + +int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); static int dccp_ctl_socket_init = 1; dccp_options_init(&dp->dccps_options); @@ -1236,9 +1073,11 @@ static int dccp_v4_init_sock(struct sock *sk) dccp_ctl_socket_init = 0; dccp_init_xmit_timers(sk); - inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + icsk->icsk_af_ops = &dccp_ipv4_af_ops; + icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; @@ -1246,7 +1085,9 @@ static int dccp_v4_init_sock(struct sock *sk) return 0; } -static int dccp_v4_destroy_sock(struct sock *sk) +EXPORT_SYMBOL_GPL(dccp_v4_init_sock); + +int dccp_v4_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); @@ -1279,6 +1120,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock); + static void dccp_v4_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->opt); @@ -1293,7 +1136,11 @@ static struct request_sock_ops dccp_request_sock_ops = { .send_reset = dccp_v4_ctl_send_reset, }; -struct proto dccp_v4_prot = { +static struct timewait_sock_ops dccp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; + +struct proto dccp_prot = { .name = "DCCP", .owner = THIS_MODULE, .close = dccp_close, @@ -1307,7 +1154,7 @@ struct proto dccp_v4_prot = { .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v4_do_rcv, .hash = dccp_v4_hash, - .unhash = dccp_v4_unhash, + .unhash = dccp_unhash, .accept = inet_csk_accept, .get_port = dccp_v4_get_port, .shutdown = dccp_shutdown, @@ -1316,5 +1163,7 @@ struct proto dccp_v4_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .rsk_prot = &dccp_request_sock_ops, - .twsk_obj_size = sizeof(struct inet_timewait_sock), + .twsk_prot = &dccp_timewait_sock_ops, }; + +EXPORT_SYMBOL_GPL(dccp_prot); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c new file mode 100644 index 000000000000..c609dc78f487 --- /dev/null +++ b/net/dccp/ipv6.c @@ -0,0 +1,1261 @@ +/* + * DCCP over IPv6 + * Linux INET6 implementation + * + * Based on net/dccp6/ipv6.c + * + * Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/xfrm.h> + +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/inet_sock.h> +#include <net/inet6_connection_sock.h> +#include <net/inet6_hashtables.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/xfrm.h> + +#include "dccp.h" +#include "ipv6.h" + +static void dccp_v6_ctl_send_reset(struct sk_buff *skb); +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req); +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); + +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); + +static struct inet_connection_sock_af_ops dccp_ipv6_mapped; +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops; + +static int dccp_v6_get_port(struct sock *sk, unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet6_csk_bind_conflict); +} + +static void dccp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != DCCP_CLOSED) { + if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { + dccp_prot.hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(&dccp_hashinfo, sk); + local_bh_enable(); + } +} + +static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base); +} + +static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + + if (skb->protocol == htons(ETH_P_IPV6)) + return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, + skb->nh.ipv6h->saddr.s6_addr32, + dh->dccph_dport, + dh->dccph_sport); + else + return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dh->dccph_dport, + dh->dccph_sport); +} + +static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct flowi fl; + struct dst_entry *dst; + int addr_type; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl, 0, sizeof(fl)); + + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl.fl6_flowlabel); + if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if (ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl.fl6_flowlabel; + + /* + * DCCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = icsk->icsk_ext_hdr_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + icsk->icsk_af_ops = &dccp_ipv6_mapped; + sk->sk_backlog_rcv = dccp_v4_do_rcv; + + err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_backlog_rcv = dccp_v6_do_rcv; + goto failure; + } else { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), + inet->saddr); + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), + inet->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto failure; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto failure; + + if (saddr == NULL) { + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->rcv_saddr = LOOPBACK4_IPV6; + + ip6_dst_store(sk, dst, NULL); + + icsk->icsk_ext_hdr_len = 0; + if (np->opt) + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); + + inet->dport = usin->sin6_port; + + dccp_set_state(sk, DCCP_REQUESTING); + err = inet6_hash_connect(&dccp_death_row, sk); + if (err) + goto late_failure; + /* FIXME */ +#if 0 + dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->sport, + inet->dport); +#endif + err = dccp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + dccp_set_state(sk, DCCP_CLOSED); + __sk_dst_reset(sk); +failure: + inet->dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + __u64 seq; + + sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport, + &hdr->saddr, dh->dccph_sport, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi fl; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->sk_err_soft = -err; + goto out; + } + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + goto out; + } + + } else + dst_hold(dst); + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + dccp_sync_mss(sk, dst_mtu(dst)); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + seq = DCCP_SKB_CB(skb)->dccpd_seq; + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, dh->dccph_dport, + &hdr->daddr, &hdr->saddr, + inet6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + BUG_TRAP(req->sk == NULL); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + /* + * Wake people up to see the error + * (see connect in sock.c) + */ + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr *final_p = NULL, final; + struct flowi fl; + int err = -1; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.fl6_flowlabel = 0; + fl.oif = ireq6->iif; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (dst == NULL) { + opt = np->opt; + if (opt == NULL && + np->rxopt.bits.osrcrt == 2 && + ireq6->pktopts) { + struct sk_buff *pktopts = ireq6->pktopts; + struct inet6_skb_parm *rxopt = IP6CB(pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(pktopts->nh.raw + + rxopt->srcrt)); + } + + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto done; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto done; + } + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + struct dccp_hdr *dh = dccp_hdr(skb); + dh->dccph_checksum = dccp_v6_check(dh, skb->len, + &ireq6->loc_addr, + &ireq6->rmt_addr, + csum_partial((char *)dh, + skb->len, + skb->csum)); + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + err = ip6_xmit(sk, skb, &fl, opt, 0); + if (err == NET_XMIT_CN) + err = 0; + } + +done: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return err; +} + +static void dccp_v6_reqsk_destructor(struct request_sock *req) +{ + if (inet6_rsk(req)->pktopts != NULL) + kfree_skb(inet6_rsk(req)->pktopts); +} + +static struct request_sock_ops dccp6_request_sock_ops = { + .family = AF_INET6, + .obj_size = sizeof(struct dccp6_request_sock), + .rtx_syn_ack = dccp_v6_send_response, + .send_ack = dccp_v6_reqsk_send_ack, + .destructor = dccp_v6_reqsk_destructor, + .send_reset = dccp_v6_ctl_send_reset, +}; + +static struct timewait_sock_ops dccp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr, + len, IPPROTO_DCCP, + csum_partial((char *)dh, + dh->dccph_doff << 2, + skb->csum)); +} + +static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb) +{ + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct flowi fl; + u64 seqno; + + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (!ipv6_unicast_destination(rxskb)) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Swap the send and the receive. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = + DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */ + seqno = 0; + if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1); + + dccp_hdr_set_seq(dh, seqno); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + sizeof(*dh), IPPROTO_DCCP, + skb->csum); + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + /* sk = NULL, but it is safe for now. RST socket required. */ + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb) +{ + struct flowi fl; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + + /* FIXME: calculate checksum, IPv4 also should... */ + + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req) +{ + dccp_v6_ctl_send_ack(skb); +} + +static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct ipv6hdr *iph = skb->nh.ipv6h; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet6_csk_search_req(sk, &prev, + dh->dccph_sport, + &iph->saddr, + &iph->daddr, + inet6_iif(skb)); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(&dccp_hashinfo, + &iph->saddr, dh->dccph_sport, + &iph->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + struct inet6_request_sock *ireq6; + struct ipv6_pinfo *np = inet6_sk(sk); + const __u32 service = dccp_hdr_request(skb)->dccph_req_service; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + if (dccp_bad_service_code(sk, service)) { + reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; + goto drop; + } + /* + * There are no SYN attacks on IPv6, yet... + */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq6 = inet6_rsk(req); + ireq = inet_rsk(req); + ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); + req->rcv_wnd = 100; /* Fake, option parsing will get the + right value */ + ireq6->pktopts = NULL; + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + ireq6->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq = dccp_rsk(req); + dreq->dreq_isr = dcb->dccpd_seq; + dreq->dreq_iss = dccp_v6_init_sequence(sk, skb); + dreq->dreq_service = service; + + if (dccp_v6_send_response(sk, req, NULL)) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + dcb->dccpd_reset_code = reset_code; + return -1; +} + +static struct sock *dccp_v6_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct dccp6_sock *newdp6; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + if (newsk == NULL) + return NULL; + + newdp6 = (struct dccp6_sock *)newsk; + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + newinet->daddr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + newinet->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; + newsk->sk_backlog_rcv = dccp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, dccp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. + */ + dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (np->rxopt.bits.osrcrt == 2 && + opt == NULL && ireq6->pktopts) { + struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw + + rxopt->srcrt)); + } + + if (dst == NULL) { + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out; + } + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, dccp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newdp = dccp_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr); + ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr); + newsk->sk_bound_dev_if = ireq6->iif; + + /* Now IPv6 options... + + First: no IPv4 options. + */ + newinet->opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (ireq6->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC); + kfree_skb(ireq6->pktopts); + ireq6->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (newnp->opt) + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); + + dccp_sync_mss(newsk, dst_mtu(dst)); + + newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + + __inet6_hash(&dccp_hashinfo, newsk); + inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +out: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return NULL; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, dccp_rcv_established and rcv_established + handle them correctly, but it is not case with + dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb, 0)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + } + + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if(nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + +reset: + dccp_v6_ctl_send_reset(skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +} + +static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + const struct dccp_hdr *dh; + struct sk_buff *skb = *pskb; + struct sock *sk; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); + + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + if (dccp_packet_without_ack(skb)) + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + else + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr, + dh->dccph_sport, + &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) + goto no_dccp_socket; + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + return sk_receive_skb(sk, skb) ? -1 : 0; + +no_dccp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v6_ctl_send_reset(skb); + } +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + inet_twsk_put((struct inet_timewait_sock *)sk); + goto no_dccp_socket; +} + +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { + .queue_xmit = inet6_csk_xmit, + .send_check = dccp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* + * DCCP over IPv4 via INET6 API + */ +static struct inet_connection_sock_af_ops dccp_ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int dccp_v6_init_sock(struct sock *sk) +{ + int err = dccp_v4_init_sock(sk); + + if (err == 0) + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + + return err; +} + +static int dccp_v6_destroy_sock(struct sock *sk) +{ + dccp_v4_destroy_sock(sk); + return inet6_destroy_sock(sk); +} + +static struct proto dccp_v6_prot = { + .name = "DCCPv6", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v6_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v6_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v6_do_rcv, + .hash = dccp_v6_hash, + .unhash = dccp_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v6_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v6_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp6_sock), + .rsk_prot = &dccp6_request_sock_ops, + .twsk_prot = &dccp6_timewait_sock_ops, +}; + +static struct inet6_protocol dccp_v6_protocol = { + .handler = dccp_v6_rcv, + .err_handler = dccp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static struct proto_ops inet6_dccp_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = dccp_poll, + .ioctl = inet6_ioctl, + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct inet_protosw dccp_v6_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v6_prot, + .ops = &inet6_dccp_ops, + .capability = -1, + .flags = INET_PROTOSW_ICSK, +}; + +static int __init dccp_v6_init(void) +{ + int err = proto_register(&dccp_v6_prot, 1); + + if (err != 0) + goto out; + + err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + if (err != 0) + goto out_unregister_proto; + + inet6_register_protosw(&dccp_v6_protosw); +out: + return err; +out_unregister_proto: + proto_unregister(&dccp_v6_prot); + goto out; +} + +static void __exit dccp_v6_exit(void) +{ + inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + inet6_unregister_protosw(&dccp_v6_protosw); + proto_unregister(&dccp_v6_prot); +} + +module_init(dccp_v6_init); +module_exit(dccp_v6_exit); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); +MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h new file mode 100644 index 000000000000..e4d4e9309270 --- /dev/null +++ b/net/dccp/ipv6.h @@ -0,0 +1,37 @@ +#ifndef _DCCP_IPV6_H +#define _DCCP_IPV6_H +/* + * net/dccp/ipv6.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/ipv6.h> + +struct dccp6_sock { + struct dccp_sock dccp; + /* + * ipv6_pinfo has to be the last member of dccp6_sock, + * see inet6_sk_generic. + */ + struct ipv6_pinfo inet6; +}; + +struct dccp6_request_sock { + struct dccp_request_sock dccp; + struct inet6_request_sock inet6; +}; + +struct dccp6_timewait_sock { + struct inet_timewait_sock inet; + struct inet6_timewait_sock tw6; +}; + +#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 1393461898bb..29261fc198e7 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = { (unsigned long)&dccp_death_row), }; +EXPORT_SYMBOL_GPL(dccp_death_row); + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; + + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } +#endif /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); @@ -170,6 +183,8 @@ out_free: return newsk; } +EXPORT_SYMBOL_GPL(dccp_create_openreq_child); + /* * Process an incoming packet for RESPOND sockets represented * as an request_sock. @@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, goto drop; } - child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; @@ -236,6 +251,8 @@ drop: goto out; } +EXPORT_SYMBOL_GPL(dccp_check_req); + /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with @@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child, sock_put(child); return ret; } + +EXPORT_SYMBOL_GPL(dccp_child_process); diff --git a/net/dccp/output.c b/net/dccp/output.c index 74ff87025878..efd7ffb903a1 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -15,6 +15,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> +#include <net/inet_sock.h> #include <net/sock.h> #include "ackvec.h" @@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (likely(skb != NULL)) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); struct dccp_hdr *dh; @@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) break; } - dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, - inet->daddr); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (set_ack) dccp_event_ack_sent(sk); @@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - err = ip_queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (err <= 0) return err; @@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { + struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int mss_now; - - /* - * FIXME: we really should be using the af_specific thing to support - * IPv6. - * mss_now = pmtu - tp->af_specific->net_header_len - - * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); - */ - mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - - sizeof(struct dccp_hdr_ext); + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - + sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); /* Now subtract optional transport overhead */ - mss_now -= dp->dccps_ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ - dp->dccps_pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; dp->dccps_mss_cache = mss_now; return mss_now; } +EXPORT_SYMBOL_GPL(dccp_sync_mss); + void dccp_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); @@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo) int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - if (inet_sk_rebuild_header(sk) != 0) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) return -EHOSTUNREACH; /* Routing failure or similar. */ return dccp_transmit_skb(sk, (skb_cloned(skb) ? @@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, return skb; } +EXPORT_SYMBOL_GPL(dccp_make_response); + struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, const enum dccp_reset_codes code) @@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, */ static inline void dccp_connect_init(struct sock *sk) { + struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* - * FIXME: set dp->{dccps_swh,dccps_swl}, with - * something like dccp_inc_seq - */ + dccp_update_gss(sk, dp->dccps_iss); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); icsk->icsk_retransmits = 0; } @@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_connect); + void dccp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8a6b2a9e4581..65b11ea90d85 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,7 +24,7 @@ #include <net/checksum.h> #include <net/inet_common.h> -#include <net/ip.h> +#include <net/inet_sock.h> #include <net/protocol.h> #include <net/sock.h> #include <net/xfrm.h> @@ -34,15 +34,18 @@ #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> -#include <linux/dccp.h> #include "ccid.h" #include "dccp.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; +EXPORT_SYMBOL_GPL(dccp_statistics); + atomic_t dccp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(dccp_orphan_count); + static struct net_protocol dccp_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, @@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags) return err; } +EXPORT_SYMBOL_GPL(dccp_disconnect); + /* * Wait for a DCCP event. * @@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -static unsigned int dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) +unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { unsigned int mask; struct sock *sk = sock->sk; @@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL_GPL(dccp_poll); + int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { dccp_pr_debug("entry\n"); return -ENOIOCTLCMD; } +EXPORT_SYMBOL_GPL(dccp_ioctl); + static int dccp_setsockopt_service(struct sock *sk, const u32 service, char __user *optval, int optlen) { @@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, int val; if (level != SOL_DCCP) - return ip_setsockopt(sk, level, optname, optval, optlen); + return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, + optname, optval, + optlen); if (optlen < sizeof(int)) return -EINVAL; @@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL_GPL(dccp_setsockopt); + static int dccp_getsockopt_service(struct sock *sk, int len, u32 __user *optval, int __user *optlen) @@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, int val, len; if (level != SOL_DCCP) - return ip_getsockopt(sk, level, optname, optval, optlen); - + return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, + optname, optval, + optlen); if (get_user(len, optlen)) return -EFAULT; @@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, return 0; } +EXPORT_SYMBOL_GPL(dccp_getsockopt); + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -410,6 +426,8 @@ out_discard: goto out_release; } +EXPORT_SYMBOL_GPL(dccp_sendmsg); + int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -507,7 +525,9 @@ out: return len; } -static int inet_dccp_listen(struct socket *sock, int backlog) +EXPORT_SYMBOL_GPL(dccp_recvmsg); + +int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; @@ -543,6 +563,8 @@ out: return err; } +EXPORT_SYMBOL_GPL(inet_dccp_listen); + static const unsigned char dccp_new_state[] = { /* current state: new state: action: */ [0] = DCCP_CLOSED, @@ -648,12 +670,16 @@ adjudge_to_death: sock_put(sk); } +EXPORT_SYMBOL_GPL(dccp_close); + void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("entry\n"); } -static struct proto_ops inet_dccp_ops = { +EXPORT_SYMBOL_GPL(dccp_shutdown); + +static const struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops; static struct inet_protosw dccp_v4_protosw = { .type = SOCK_DCCP, .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, + .prot = &dccp_prot, .ops = &inet_dccp_ops, .capability = -1, .no_check = 0, - .flags = 0, + .flags = INET_PROTOSW_ICSK, }; /* @@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); int dccp_debug; module_param(dccp_debug, int, 0444); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; int ehash_order, bhash_order, i; - int rc = proto_register(&dccp_v4_prot, 1); + int rc = proto_register(&dccp_prot, 1); if (rc) goto out; @@ -869,7 +897,7 @@ out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_hashinfo.bind_bucket_cachep = NULL; out_proto_unregister: - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); goto out; } @@ -892,7 +920,7 @@ static void __exit dccp_fini(void) get_order(dccp_hashinfo.ehash_size * sizeof(struct inet_ehash_bucket))); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); } module_init(dccp_init); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d402e9020c68..78ec5344be86 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk); #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) -static struct proto_ops dn_proto_ops; +static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; @@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } @@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops dn_proto_ops = { +static const struct proto_ops dn_proto_ops = { .family = AF_DECnet, .owner = THIS_MODULE, .release = dn_release, diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 8d0cc3cf3e49..33ab256cfd4a 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb) } } - if (!dn_db->router) { - dn_db->router = neigh_clone(neigh); - } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) - neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + /* Only use routers in our area */ + if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) { + if (!dn_db->router) { + dn_db->router = neigh_clone(neigh); + } else { + if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + } } write_unlock(&neigh->lock); neigh_release(neigh); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 369f25b60f3f..44bda85e678f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb) got_it: if (sk != NULL) { struct dn_scp *scp = DN_SK(sk); - int ret; /* Reset backoff */ scp->nsp_rxtshift = 0; @@ -807,21 +806,7 @@ got_it: goto free_out; } - bh_lock_sock(sk); - ret = NET_RX_SUCCESS; - if (decnet_debug_level & 8) - printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n", - (int)cb->rt_flags, (int)cb->nsp_flags, - (int)cb->src_port, (int)cb->dst_port, - !!sock_owned_by_user(sk)); - if (!sock_owned_by_user(sk)) - ret = dn_nsp_backlog_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + return sk_receive_skb(sk, skb); } return dn_nsp_no_socket(skb, reason); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 34fdac51df96..c792994d7952 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include <linux/if_arp.h> #include <linux/wireless.h> #include <linux/skbuff.h> +#include <linux/udp.h> #include <net/sock.h> #include <net/inet_common.h> #include <linux/stat.h> @@ -45,7 +46,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -static struct proto_ops econet_ops; +static const struct proto_ops econet_ops; static struct hlist_head econet_sklist; static DEFINE_RWLOCK(econet_lock); @@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256]; #define EC_PORT_IP 0xd2 #ifdef CONFIG_ECONET_AUNUDP -static spinlock_t aun_queue_lock; +static DEFINE_SPINLOCK(aun_queue_lock); static struct socket *udpsock; #define AUN_PORT 0x8000 @@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg break; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } /*NOTREACHED*/ return 0; @@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { .family = PF_ECONET, .owner = THIS_MODULE, .release = econet_release, diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 03efaacbdb73..4cc6f41c6930 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, return 1; } - if ((is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : - ieee->host_decrypt) { + if (is_multicast_ether_addr(hdr->addr1) + ? ieee->host_mc_decrypt : ieee->host_decrypt) { int idx = 0; if (skb->len >= hdrlen + 3) idx = skb->data[hdrlen + 3] >> 6; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e55136ae09f4..011cca7ae02b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -456,6 +456,14 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config TCP_CONG_CUBIC + tristate "CUBIC TCP" + default m + ---help--- + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f0435d00db6b..c54edd76de09 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d368cf249000..966a071a408c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -93,6 +93,7 @@ #include <linux/smp_lock.h> #include <linux/inet.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> @@ -302,6 +303,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -775,16 +777,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = devinet_ioctl(cmd, (void __user *)arg); break; default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == - -ENOIOCTLCMD) - err = dev_ioctl(cmd, (void __user *)arg); + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; break; } return err; } -struct proto_ops inet_stream_ops = { +const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -805,7 +807,7 @@ struct proto_ops inet_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet_dgram_ops = { +const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -830,7 +832,7 @@ struct proto_ops inet_dgram_ops = { * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ -static struct proto_ops inet_sockraw_ops = { +static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -869,7 +871,8 @@ static struct inet_protosw inetsw_array[] = .ops = &inet_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }, { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c9e1ba..aed537fa2c88 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -6,6 +6,7 @@ #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <net/icmp.h> +#include <net/protocol.h> #include <asm/scatterlist.h> diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748f02d7..37432088fe6d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -86,6 +86,7 @@ #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04a6fe3e95a2..7b9bb28e2ee9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -58,6 +58,7 @@ #endif #include <linux/kmod.h> +#include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce66e7b7..73bfcae8af9c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -9,6 +9,7 @@ #include <linux/pfkeyv2.h> #include <linux/random.h> #include <net/icmp.h> +#include <net/protocol.h> #include <net/udp.h> /* decapsulation data for use when post-processing */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b984d687..18f5e509281a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -30,6 +30,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209cb169..e2890ec8159e 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bbc1518..0dd4d06e456d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -33,6 +33,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac070e3..ef4724de7350 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> @@ -36,6 +37,7 @@ #include <linux/netlink.h> #include <linux/init.h> +#include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 705e3ce86df9..e320b32373e5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -41,6 +41,13 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Substantial contributions to this work comes from: + * + * David S. Miller, <davem@davemloft.net> + * Stephen Hemminger <shemminger@osdl.org> + * Paul E. McKenney <paulmck@us.ibm.com> + * Patrick McHardy <kaber@trash.net> */ #define VERSION "0.404" @@ -59,6 +66,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2ad4d2..be5a519cd2f8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -73,6 +73,7 @@ #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c724f01..34758118c10c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,8 @@ #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> + +#include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3fe021f1a566..ae20281d8deb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke return node != NULL; } +EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -125,7 +130,7 @@ tb_found: goto success; } else { ret = 1; - if (inet_csk_bind_conflict(sk, tb)) + if (bind_conflict(sk, tb)) goto fail_unlock; } } @@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout) + unsigned long timeout) { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; @@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + +void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + const struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 39061ed53cfd..c49908192047 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); + &tw6->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); + &tw6->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6_rsk(req)->loc_addr); + &inet6_rsk(req)->loc_addr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6_rsk(req)->rmt_addr); + &inet6_rsk(req)->rmt_addr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.saddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : + inet6_rsk(req)->loc_addr.s6_addr32 : #endif &ireq->loc_addr; entry.daddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->rmt_addr.s6_addr32 : + inet6_rsk(req)->rmt_addr.s6_addr32 : #endif &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8d29fe736d2..33228115cda4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,12 +15,14 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#include <net/ip.h> /* * Allocate and initialize a new local port bind bucket. @@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } EXPORT_SYMBOL_GPL(__inet_lookup_listener); + +/* called with local bh disabled */ +static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hash = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + inet_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet_hash(hinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row);; + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __inet_hash(hinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a010e9a68811..417f126c749e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, - SLAB_ATOMIC); + struct inet_timewait_sock *tw = + kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + SLAB_ATOMIC); if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2fc3fd38924f..ce5fe3f74a3d 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) return NULL; n->v4daddr = daddr; atomic_set(&n->refcnt, 1); + atomic_set(&n->rid, 0); n->ip_id_count = secure_ip_id(daddr); n->tcp_ts_stamp = 0; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8ce0ce2ee48e..ce2b70ce4018 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include <linux/compiler.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -38,6 +39,7 @@ #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> +#include <net/inetpeer.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -89,8 +93,10 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - int iif; struct timeval stamp; + int iif; + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; /* Initialize a timer for this entry. */ init_timer(&qp->timer); @@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) return ip_frag_create(hash, iph, user); } +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->fragments && (end - start) > max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + return 0; +} + /* Add new segment to existing queue. */ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { @@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2b2e0d..e45846ae570b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -128,6 +128,7 @@ #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da8d8b3..d3f6c468faf4 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -22,6 +22,7 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> +#include <net/route.h> /* * Write options to IP header, record destination address to diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index eba64e2bd397..2a830de3a699 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4f2d87257309..6986e11d65cc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -25,12 +25,12 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <net/tcp.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/netfilter.h> @@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; - if (sk->sk_type == SOCK_STREAM) { - struct tcp_sock *tp = tcp_sk(sk); + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & @@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, inet->daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) - tp->ext_header_len -= inet->opt->optlen; + icsk->icsk_ext_hdr_len -= inet->opt->optlen; if (opt) - tp->ext_header_len += opt->optlen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df17b40..d64e2ec8da7b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -28,6 +28,7 @@ #include <net/xfrm.h> #include <net/icmp.h> #include <net/ipcomp.h> +#include <net/protocol.h> struct ipcomp_tfms { struct list_head list; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674baaa8d9..bb3613ec448c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -42,6 +42,7 @@ #include <linux/in.h> #include <linux/if.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> @@ -58,6 +59,7 @@ #include <net/arp.h> #include <net/ip.h> #include <net/ipconfig.h> +#include <net/route.h> #include <asm/uaccess.h> #include <net/checksum.h> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb507c9..caa3b7d2e48a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -52,6 +52,7 @@ #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <net/route.h> #include <net/sock.h> #include <net/icmp.h> #include <net/udp.h> diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d7eb680101c2..9b176a942ac5 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app) } -#if 0000 -/* - * Get reference to app by name (called from user context) - */ -struct ip_vs_app *ip_vs_app_get_by_name(char *appname) -{ - struct ip_vs_app *app, *a = NULL; - - down(&__ip_vs_app_mutex); - - list_for_each_entry(ent, &ip_vs_app_list, a_list) { - if (strcmp(app->name, appname)) - continue; - - /* softirq may call ip_vs_app_get too, so the caller - must disable softirq on the current CPU */ - if (ip_vs_app_get(app)) - a = app; - break; - } - - up(&__ip_vs_app_mutex); - - return a; -} -#endif - - /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c59c655..81d90354c928 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -24,7 +24,10 @@ * */ +#include <linux/in.h> +#include <linux/net.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ #include <linux/seq_file.h> @@ -219,7 +222,7 @@ struct ip_vs_conn *ip_vs_conn_in_get if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); - IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -254,7 +257,7 @@ struct ip_vs_conn *ip_vs_ct_in_get out: ct_read_unlock(hash); - IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -295,7 +298,7 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); - IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -391,8 +394,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; - IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -430,8 +434,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) if (!dest) return; - IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -571,7 +576,7 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_hash(cp); expire_later: - IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", atomic_read(&cp->refcnt)-1, atomic_read(&cp->n_control)); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1a0843cd58a9..1aca94a9fd8b 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31b760e..c935c5086d33 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,7 @@ #include <linux/netfilter_ipv4.h> #include <net/ip.h> +#include <net/route.h> #include <net/sock.h> #include <asm/uaccess.h> @@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) out: read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", fwmark, ip_vs_proto_name(protocol), NIPQUAD(vaddr), ntohs(vport), svc?"hit":"not hit"); @@ -597,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) */ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "refcnt=%d\n", + "dest->refcnt=%d\n", dest->vfwmark, NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); @@ -804,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) dest = ip_vs_trash_get_dest(svc, daddr, dport); if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", NIPQUAD(daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, @@ -949,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) atomic_dec(&dest->svc->refcnt); kfree(dest); } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " + "dest->refcnt=%d\n", NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320dce93..9fee19c4c617 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -37,8 +37,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2fc1fa1..e7004741ac73 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -13,7 +13,10 @@ * Changes: * */ +#include <linux/config.h> #include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/slab.h> #include <linux/types.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 561cda326fa8..6e5cb92a5c83 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -41,8 +41,10 @@ * me to write this module. */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> /* for sysctl */ #include <linux/fs.h> @@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. - * returns bool success. - */ -static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, - struct ip_vs_lblc_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblc_entry associated with supplied parameters. */ diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index ce456dbf09a5..32ba37ba72d8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -39,8 +39,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> /* for sysctl */ #include <linux/fs.h> @@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. - * returns bool success. - */ -static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, - struct ip_vs_lblcr_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblcr_entry associated with supplied parameters. */ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a0bbd7..8b0505b09317 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -12,6 +12,8 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7c7e8e..c36ccf057a19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -12,6 +12,8 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 0e878fd6215c..bc28b1160a3a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = 2*HZ, }; - -#if 0 - -/* FIXME: This is going to die */ - -static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 10*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 100*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -#endif - static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", @@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_dest *dest = cp->dest; IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", pp->name, (state_off==TCP_DIR_OUTPUT)?"output ":"input ", th->syn? 'S' : '.', diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e0aefa..89d9175d8f28 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -15,8 +15,11 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/kernel.h> #include <linux/netfilter_ipv4.h> +#include <linux/udp.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e44a39..7775e6cc68be 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -34,8 +34,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3d8062..1bca714bda3d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,12 +21,14 @@ #include <linux/module.h> #include <linux/slab.h> +#include <linux/inetdevice.h> #include <linux/net.h> #include <linux/completion.h> #include <linux/delay.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/igmp.h> /* for ip_mc_join_group */ +#include <linux/udp.h> #include <net/ip.h> #include <net/sock.h> diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3c2e9639bba6..bba156304695 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -68,19 +68,14 @@ struct arpt_table_info { unsigned int initial_entries; unsigned int hook_entry[NF_ARP_NUMHOOKS]; unsigned int underflow[NF_ARP_NUMHOOKS]; - char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); + void *entries[NR_CPUS]; }; static LIST_HEAD(arpt_target); static LIST_HEAD(arpt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, char *hdr_addr, int len) { @@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, outdev = out ? out->name : nulldevname; read_lock_bh(&table->lock); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, - smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); back = get_entry(table_base, table->private->underflow[hook]); @@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ -static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +static int mark_source_chains(struct arpt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e - = (struct arpt_entry *)(newinfo->entries + pos); + = (struct arpt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali goto next; e = (struct arpt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct arpt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali newpos = pos + e->next_offset; } e = (struct arpt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) static int translate_table(const char *name, unsigned int valid_hooks, struct arpt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -710,11 +705,11 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -743,29 +738,26 @@ static int translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) { + if (!mark_source_chains(newinfo, valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e, return 0; } +static inline int set_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct arpt_table_info *t, struct arpt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + ARPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size, struct arpt_entry *e; struct arpt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ struct arpt_entry_target *t; - e = (struct arpt_entry *)(table->private->entries + off); + e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], @@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries, return ret; } +static void free_table_info(struct arpt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct arpt_table_info *alloc_table_info(unsigned int size) +{ + struct arpt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { int ret; @@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len) struct arpt_table *t; struct arpt_table_info *newinfo, *oldinfo; struct arpt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct arpt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len) module_put(t->me); up(&arpt_mutex); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len) struct arpt_counters_info tmp, *paddc; struct arpt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len) } i = 0; - ARPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table, struct arpt_table_info *newinfo; static struct arpt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) { ret = -ENOMEM; return ret; } - memcpy(newinfo->entries, repl->entries, repl->size); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&arpt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void arpt_unregister_table(struct arpt_table *table) { + void *loc_cpu_entry; + down(&arpt_mutex); LIST_DELETE(&arpt_tables, table); up(&arpt_mutex); /* Decrease module usage counts and free resources */ - ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847fa10f5..0366eedb4d70 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -18,11 +18,13 @@ * */ +#include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/ip.h> #include <linux/moduleparam.h> +#include <linux/udp.h> #include <net/checksum.h> #include <net/udp.h> diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 744abb9d377a..57956dee60c8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -31,6 +31,7 @@ #include <linux/ip.h> #include <linux/in.h> #include <linux/list.h> +#include <linux/seq_file.h> static DEFINE_RWLOCK(ip_ct_gre_lock); #define ASSERT_READ_LOCK(x) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index f2dcac7c7660..46becbe4fe58 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -11,6 +11,7 @@ #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> +#include <linux/ip.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <net/checksum.h> diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b191f4b..a88bcc551244 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -27,6 +27,7 @@ #endif #include <net/checksum.h> #include <net/ip.h> +#include <net/route.h> #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed40b47..4f95d477805c 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -44,6 +44,7 @@ * */ #include <linux/config.h> +#include <linux/in.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -53,6 +54,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_nat_helper.h> #include <linux/ip.h> +#include <linux/udp.h> #include <net/checksum.h> #include <net/udp.h> #include <asm/uaccess.h> diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 45886c8475e8..2a26d167e149 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -105,20 +100,15 @@ struct ipt_table_info unsigned int underflow[NF_IP_NUMHOOKS]; /* ipt_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ipt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e - = (struct ipt_entry *)(newinfo->entries + pos); + = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ipt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ipt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ipt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -855,6 +845,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ipt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -875,11 +866,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -907,27 +898,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -943,15 +931,12 @@ replace_table(struct ipt_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ipt_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ipt_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ipt_table_info *t, struct ipt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry *e; struct ipt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct ipt_counters) * table->private->number; - counters = vmalloc(countersize); + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry_match *m; struct ipt_entry_target *t; - e = (struct ipt_entry *)(table->private->entries + off); + e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries, return ret; } +static void free_table_info(struct ipt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ipt_table_info *alloc_table_info(unsigned int size) +{ + struct ipt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu)); + if (newinfo->entries[cpu] == 0) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len) struct ipt_table *t; struct ipt_table_info *newinfo, *oldinfo; struct ipt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ipt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ipt_mutex); free_newinfo_counters_untrans: - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len) struct ipt_counters_info tmp, *paddc; struct ipt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len) if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) struct ipt_table_info *newinfo; static struct ipt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu + * but dont care of preemption + */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ipt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ipt_unregister_table(struct ipt_table *table) { + void *loc_cpu_entry; + down(&ipt_mutex); LIST_DELETE(&ipt_tables, table); up(&ipt_mutex); /* Decrease module usage counts and free resources */ - IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174c6fe6..27860510ca6d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,6 +11,7 @@ #include <linux/config.h> #include <linux/types.h> +#include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/timer.h> #include <linux/module.h> @@ -18,6 +19,7 @@ #include <net/protocol.h> #include <net/ip.h> #include <net/checksum.h> +#include <net/route.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_nat_rule.h> #include <linux/netfilter_ipv4/ip_tables.h> diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c index 1a53924041fc..03f554857a4d 100644 --- a/net/ipv4/netfilter/ipt_physdev.c +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -9,6 +9,7 @@ */ #include <linux/module.h> +#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/netfilter_ipv4/ipt_physdev.h> #include <linux/netfilter_ipv4/ip_tables.h> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc668db46..39d49dc333a7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -38,6 +38,7 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> +#include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/sock.h> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60ea48a1..e20be3331f67 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; - child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) inet_csk_reqsk_queue_add(sk, req, child); else diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 01444a02b48b..16984d4a8a06 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include <linux/sysctl.h> #include <linux/config.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -22,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind; #ifdef CONFIG_SYSCTL +static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -614,6 +616,15 @@ ctl_table ipv4_table[] = { .strategy = &sysctl_jiffies }, { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef98b14ac56d..00aa80e93243 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int err = 0; if (level != SOL_TCP) - return tp->af_specific->setsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); /* This is a string value all the others are int's */ if (optname == TCP_CONGESTION) { @@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); - info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; @@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int val, len; if (level != SOL_TCP) - return tp->af_specific->getsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); if (get_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 1d0cd86621b1..035f2092d73a 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -30,8 +30,6 @@ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int low_utilization_threshold = 153; -static int low_utilization_period = 2; static int initial_ssthresh = 100; static int smooth_part = 20; @@ -43,10 +41,6 @@ module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); -module_param(low_utilization_threshold, int, 0644); -MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); -module_param(low_utilization_period, int, 0644); -MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); @@ -60,11 +54,6 @@ struct bictcp { u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ - u32 delay_min; /* min delay */ - u32 delay_max; /* max delay */ - u32 last_delay; - u8 low_utilization;/* 0: high; 1: low */ - u32 low_utilization_start; /* starting time of low utilization detection*/ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ @@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; - ca->delay_min = 0; - ca->delay_max = 0; - ca->last_delay = 0; - ca->low_utilization = 0; - ca->low_utilization_start = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } @@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } /* if in slow start or link utilization is very low */ - if ( ca->loss_cwnd == 0 || - (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->loss_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } @@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } - -/* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct sock *sk, int flag) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 dist, delay; - - /* No time stamp */ - if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - tcp_time_stamp < ca->epoch_start + HZ || - /* this delay samples may not be accurate */ - flag == 0) { - ca->last_delay = 0; - goto notlow; - } - - delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ - ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - if (delay == 0) /* no previous delay sample */ - goto notlow; - - /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) { - ca->delay_min = ca->delay_max = delay; - goto notlow; - } - - if (ca->delay_max < delay) - ca->delay_max = delay; - - /* utilization is low, if avg delay < dist*threshold - for checking_period time */ - dist = ca->delay_max - ca->delay_min; - if (dist <= ca->delay_min>>6 || - tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) - goto notlow; - - if (ca->low_utilization_start == 0) { - ca->low_utilization = 0; - ca->low_utilization_start = tcp_time_stamp; - } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) - > low_utilization_period*HZ) { - ca->low_utilization = 1; - } - - return; - - notlow: - ca->low_utilization = 0; - ca->low_utilization_start = 0; - -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(sk, data_acked); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->epoch_start = 0; /* end of epoch */ - /* in case of wrong delay_max*/ - if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) - ca->delay_max = ca->delay_min - + ((ca->delay_max - ca->delay_min)* 90) / 100; - /* Wmax and fast convergence */ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) @@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state) bictcp_reset(inet_csk_ca(sk)); } -/* Track delayed acknowledgement ratio using sliding window +/* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, u32 cnt) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c7cc62c8dc12..e688c687d62d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } + +/* + * Linear increase during slow start + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + /* * TCP Reno congestion control * This is special case used for fallback as well. diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c new file mode 100644 index 000000000000..31a4986dfbf7 --- /dev/null +++ b/net/ipv4/tcp_cubic.c @@ -0,0 +1,411 @@ +/* + * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * + * This is from the implementation of CUBIC TCP in + * Injong Rhee, Lisong Xu. + * "CUBIC: A New TCP-Friendly High-Speed TCP Variant + * in PFLDnet 2005 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + * + * Unless CUBIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> +#include <asm/div64.h> + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +static int fast_convergence = 1; +static int max_increment = 16; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh = 100; +static int bic_scale = 41; +static int tcp_friendliness = 1; + +static u32 cube_rtt_scale; +static u32 beta_scale; +static u64 cube_factor; + +/* Note parameters that are used for precomputing scale factors are read-only */ +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(beta, int, 0444); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(bic_scale, int, 0444); +MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); +module_param(tcp_friendliness, int, 0644); +MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); + +#include <asm/div64.h> + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 bic_origin_point;/* origin point of bic function */ + u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 delay_min; /* min delay */ + u32 epoch_start; /* beginning of an epoch */ + u32 ack_cnt; /* number of acks */ + u32 tcp_cwnd; /* estimated tcp cwnd */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->bic_origin_point = 0; + ca->bic_K = 0; + ca->delay_min = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; + ca->ack_cnt = 0; + ca->tcp_cwnd = 0; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + if (initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) +{ + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (uint32_t) dividend / d; + + return dividend; +} + +/* + * calculate the cubic root of x using Newton-Raphson + */ +static u32 cubic_root(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * cbrt(x) = exp(log(x) / 3) + */ + x = 1u << (fls64(a)/3); + + /* + * Iteration based on: + * 2 + * x = ( 2 * x + a / x ) / 3 + * k+1 k k + */ + do { + x1 = x; + x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; + } while (abs(x1 - x) > 1); + + return x; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + u64 offs; + u32 delta, t, bic_target, min_cnt, max_cnt; + + ca->ack_cnt++; /* count the number of ACKs */ + + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->ack_cnt = 1; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using 64 bit) + * and without the support of division of 64bit numbers + * (so all divisions are done by using 32 bit) + * also NOTE the unit of those veriables + * time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! + */ + + /* change the unit from HZ to bictcp_HZ */ + t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) + << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; + else + offs = t - ca->bic_K; + + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - delta; + else /* above origin*/ + bic_target = ca->bic_origin_point + delta; + + /* cubic function - calc bictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + if (ca->delay_min > 0) { + /* max increment = Smax * rtt / 0.1 */ + min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); + if (ca->cnt < min_cnt) + ca->cnt = min_cnt; + } + + /* slow start and low utilization */ + if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ + ca->cnt = 50; + + /* TCP Friendly */ + if (tcp_friendliness) { + u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; + while (ca->ack_cnt > delta) { /* update tcp cwnd */ + ca->ack_cnt -= delta; + ca->tcp_cwnd++; + } + + if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Keep track of minimum rtt */ +static inline void measure_delay(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (data_acked) + measure_delay(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + struct bictcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops cubictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "cubic", +}; + +static int __init cubictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + + /* Precompute a bunch of the scaling factors that are used per-packet + * based on SRTT of 100ms + */ + + beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + + cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is bictcp_HZ=2^10, not HZ + * + * c = bic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + + /* 1/c * 2^2*bictcp_HZ * srtt */ + cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ + + /* divide by bic_scale and by constant Srtt (100ms) */ + do_div(cube_factor, bic_scale * 10); + + return tcp_register_congestion_control(&cubictcp); +} + +static void __exit cubictcp_unregister(void) +{ + tcp_unregister_congestion_control(&cubictcp); +} + +module_init(cubictcp_register); +module_exit(cubictcp_unregister); + +MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CUBIC TCP"); +MODULE_VERSION("2.0"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf2e23086bce..0a461232329f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && @@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); } + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). + */ +void tcp_initialize_rcv_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, TCP_MIN_RCVMSS); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} + /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on @@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +/* Set slow start threshold and cwnd not falling to slow start */ +void tcp_enter_cwr(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); + + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); @@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); @@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static inline u32 tcp_usrtt(const struct sk_buff *skb) +static u32 tcp_usrtt(const struct sk_buff *skb) { struct timeval tv, now; @@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, if (nwin > tp->max_window) { tp->max_window = nwin; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } @@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) +static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) { if (th->doff == sizeof(struct tcphdr)>>2) { tp->rx_opt.saw_tstamp = 0; @@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) } } -static __inline__ int -tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) return 0; } -static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) @@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } -static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (!tp->rx_opt.dsack) tcp_dsack_set(tp, seq, end_seq); @@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; @@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) { /* If the user specified a specific send buffer setting, do * not modify it. @@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk) sk->sk_write_space(sk); } -static inline void tcp_check_space(struct sock *sk) +static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); @@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { tcp_push_pending_frames(sk, tp); tcp_check_space(sk); @@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) } } -static __inline__ void tcp_ack_snd_check(struct sock *sk) +static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ @@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) return result; } -static __inline__ int -tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete_user(sk, skb); @@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { - struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! @@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_set_state(sk, TCP_ESTABLISHED); /* Make sure socket is routed, for correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - icsk = inet_csk(sk); - if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -4173,7 +4210,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; tp->rx_opt.saw_tstamp = 0; @@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if(th->syn) { - if(tp->af_specific->conn_request(sk, skb) < 0) + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; /* Now we have several options: In theory there is @@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Make sure socket is routed, for * correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d5021e1929b..e9f83e5b28ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -69,6 +69,7 @@ #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/timewait_sock.h> #include <net/xfrm.h> #include <linux/inet.h> @@ -86,8 +87,7 @@ int sysctl_tcp_low_latency; /* Socket used for sending RSTs */ static struct socket *tcp_socket; -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb); +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, @@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - return inet_csk_get_port(&tcp_hashinfo, sk, snum); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void tcp_v4_hash(struct sock *sk) @@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } -/* called with local bh disabled */ -static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) +int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { - struct inet_sock *inet = inet_sk(sk); - u32 daddr = inet->rcv_saddr; - u32 saddr = inet->daddr; - int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - /* With PAWS, it is safe from the viewpoint - of data integrity. Even without PAWS it - is safe provided sequence spaces do not - overlap i.e. at data rates <= 80Mbit/sec. - - Actually, the idea is close to VJ's one, - only timestamp cache is held not per host, - but per port pair and TW bucket is used - as state holder. + const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); + struct tcp_sock *tp = tcp_sk(sk); - If TW bucket has been already destroyed we - fall back to VJ's scheme and use initial - timestamp retrieved from peer table. - */ - if (tcptw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); - goto unique; - } else - goto not_unique; - } - } - tw = NULL; + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it is safe provided sequence + spaces do not overlap i.e. at data rates <= 80Mbit/sec. - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } + Actually, the idea is close to VJ's one, only timestamp cache is + held not per host, but per port pair and TW bucket is used as state + holder. -unique: - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); + If TW bucket has been already destroyed we fall back to VJ's scheme + and use initial timestamp retrieved from peer table. + */ + if (tcptw->tw_ts_recent_stamp && + (twp == NULL || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + sock_hold(sktw); + return 1; } return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; } -static inline u32 connect_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - - return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); -} - -/* - * Bind a port for a connect operation and hash it. - */ -static inline int tcp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + connect_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v4_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet_hash(&tcp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row);; - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(&tcp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} +EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - tp->ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) - tp->ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; @@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v4_hash_connect(sk); + err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; @@ -433,12 +270,10 @@ failure: /* * This routine does path mtu discovery as defined in RFC1191. */ -static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, - u32 mtu) +static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - tp->pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's @@ -644,10 +479,10 @@ out: } /* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); @@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static inline void syn_flood_warning(struct sk_buff *skb) +#ifdef CONFIG_SYN_COOKIES +static void syn_flood_warning(struct sk_buff *skb) { static unsigned long warntime; @@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb) ntohs(skb->h.th->dest)); } } +#endif /* * Save and compile IPv4 options into the request_sock if needed. */ -static inline struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *dopt = NULL; @@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = { .send_reset = tcp_v4_send_reset, }; +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) - newtp->ext_header_len = newinet->opt->optlen; + inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1314,16 +1156,6 @@ do_time_wait: goto discard_it; } -static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; -} - /* VJ's idea. Save last timestamp seen from this destination * and hold it at least for normal timewait interval to use for duplicate * segment detection in subsequent connections, before they enter synchronized @@ -1382,7 +1214,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct tcp_func ipv4_specific = { +struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1392,7 +1224,7 @@ struct tcp_func ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = v4_addr2sockaddr, + .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), }; @@ -1433,7 +1265,8 @@ static int tcp_v4_init_sock(struct sock *sk) sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1989,7 +1822,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1b66a2ac4321..2b9b7f6c7f7c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,18 +274,18 @@ kill: void tcp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tp->af_specific->remember_stamp(sk); + recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6; - ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; } #endif @@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock **prev) { struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, + req, NULL); if (child == NULL) goto listen_overflow; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7325e0b406a..a7623ead39a8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; -static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) @@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } -static inline void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) +static void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; @@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); @@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * value can be stuffed directly into th->window for an outgoing * frame. */ -static __inline__ u16 tcp_select_window(struct sock *sk) +static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); @@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk) return new_win; } +static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, + __u32 tstamp) +{ + if (tp->rx_opt.tstamp_ok) { + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); + *ptr++ = htonl(tp->rx_opt.ts_recent); + } + if (tp->rx_opt.eff_sacks) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); + for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks--; + } + } +} + +/* Construct a tcp options header for a SYN or SYN_ACK packet. + * If this is every changed make sure to change the definition of + * MAX_SYN_SIZE to match the new maximum number of options that you + * can generate. + */ +static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, + int offer_wscale, int wscale, __u32 tstamp, + __u32 ts_recent) +{ + /* We always get an MSS option. + * The option bytes which will be seen in normal data + * packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so + * that calculations in tcp_sendmsg are simpler etc. + * So account for this fact here if necessary. If we + * don't do this correctly, as a receiver we won't + * recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK + * rules correctly. + * SACKs don't matter, we never delay an ACK when we + * have any of those going out. + */ + *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + if (ts) { + if(sack) + *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); /* TSVAL */ + *ptr++ = htonl(ts_recent); /* TSECR */ + } else if(sack) + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (offer_wscale) + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); +} /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial @@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ECN_send(sk, tp, skb, tcp_header_size); } - tp->af_specific->send_check(sk, th, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); @@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_INC_STATS(TCP_MIB_OUTSEGS); - err = tp->af_specific->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (unlikely(err <= 0)) return err; @@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) It is minimum of user_mss and mss received with SYN. It also does not include TCP options. - tp->pmtu_cookie is last pmtu, seen by this function. + inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, @@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. - NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside - this function. --ANK (980731) + NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache + are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; - + struct inet_connection_sock *icsk = inet_csk(sk); /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - + sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); /* And store cached results */ - tp->pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; tp->mss_cache = mss_now; return mss_now; @@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (dst) { u32 mtu = dst_mtu(dst); - if (mtu != tp->pmtu_cookie) + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } @@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = 65535 - - tp->af_specific->net_header_len - - tp->ext_header_len - tp->tcp_header_len; + xmit_size_goal = (65535 - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) @@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out = tp->packets_out; @@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); - if(tp->af_specific->rebuild_header(sk)) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a @@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* * Do all connect socket setups that can be done AF independent. */ -static inline void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13e7e6e8df16..3b7403495052 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } + /* Use normal slow start */ + else if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2422a5f7195d..223abaa72bc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -86,6 +86,7 @@ #include <linux/module.h> #include <linux/socket.h> #include <linux/sockios.h> +#include <linux/igmp.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> @@ -846,20 +847,7 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS); - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (noblock) return -EAGAIN; @@ -1094,7 +1082,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, * Otherwise, csum completion requires chacksumming packet body, * including udp header and folding it to skb->csum. */ -static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, +static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { if (uh->check == 0) { @@ -1108,7 +1096,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, /* Probably, we should checksum udp header (it should be in cache * in any case) and data in tiny packets (< rx copybreak). */ - return 0; } /* @@ -1141,8 +1128,7 @@ int udp_rcv(struct sk_buff *skb) if (pskb_trim_rcsum(skb, ulen)) goto short_packet; - if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) - goto csum_error; + udp_checksum_init(skb, uh, ulen, saddr, daddr); if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6460eec834b7..9601fd7f9d66 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o netfilter.o + ip6_flowlabel.o ipv6_syms.o netfilter.o \ + inet6_connection_sock.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a60585fd85ad..704fb73e6c5f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device * int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d9546380fa04..68afc53be662 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,6 +167,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -389,6 +390,8 @@ int inet6_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(inet6_destroy_sock); + /* * This does both peername and sockname. */ @@ -431,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; - int err = -EINVAL; switch(cmd) { @@ -450,16 +452,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: return addrconf_set_dstaddr((void __user *) arg); default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) - return(dev_ioctl(cmd,(void __user *) arg)); - return err; + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return(0); } -struct proto_ops inet6_stream_ops = { +const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -480,7 +481,7 @@ struct proto_ops inet6_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet6_dgram_ops = { +const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -508,7 +509,7 @@ static struct net_proto_family inet6_family_ops = { }; /* Same as inet6_dgram_ops, sans udp_poll. */ -static struct proto_ops inet6_sockraw_ops = { +static const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -609,6 +610,79 @@ inet6_unregister_protosw(struct inet_protosw *p) } } +int inet6_sk_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f3629730eb15..13cc7f895583 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <net/xfrm.h> #include <asm/scatterlist.h> diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bfbe9970793..6de8ee1a5ad9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <linux/random.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <linux/icmpv6.h> static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index be6faf311387..113374dc342c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) return opt; } +EXPORT_SYMBOL_GPL(ipv6_invert_rthdr); + /********************************** Hop-by-hop options. **********************************/ @@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) return opt2; } +EXPORT_SYMBOL_GPL(ipv6_dup_options); + static int ipv6_renew_option(void *ohdr, struct ipv6_opt_hdr __user *newopt, int newoptlen, int inherit, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 000000000000..792f90f0f9ec --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,199 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> + +#include <net/addrconf.h> +#include <net/inet_connection_sock.h> +#include <net/inet_ecn.h> +#include <net/inet_hashtables.h> +#include <net/ip6_route.h> +#include <net/sock.h> + +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + +/* + * request_sock (formerly open request) hash tables. + */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + u32 a = raddr->s6_addr32[0]; + u32 b = raddr->s6_addr32[1]; + u32 c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32)rport; + __jhash_mix(a, b, c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet6_request_sock *treq = inet6_rsk(req); + + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 01d5f46d4e40..4154f3a8b6cf 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -5,7 +5,8 @@ * * Generic INET6 transport hashtables * - * Authors: Lotsa people, from code originally in tcp + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo <acme@mandriva.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,12 +15,13 @@ */ #include <linux/config.h> - #include <linux/module.h> +#include <linux/random.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> +#include <net/ip.h> struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, @@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, } EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, + inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); + + tw = inet_twsk(sk2); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hash = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + const int low = sysctl_local_port_range[0]; + const int high = sysctl_local_port_range[1]; + const int range = high - low; + int i, port; + static u32 hint; + const u32 offset = hint + inet6_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet6_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet6_hash(hinfo, sk); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet6_hash(hinfo, sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet6_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1cf02765fb5c..89d12b4817a9 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label) return NULL; } +EXPORT_SYMBOL_GPL(fl6_sock_lookup); + void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8523c76ebf76..b4c4beba0ede 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -775,6 +775,8 @@ out_err_release: return err; } +EXPORT_SYMBOL_GPL(ip6_dst_lookup); + static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 55917fb17094..626dd39685f2 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -47,6 +47,7 @@ #include <linux/rtnetlink.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3620718defe6..c63868dd2ca2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sk_refcnt_debug_dec(sk); if (sk->sk_protocol == IPPROTO_TCP) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); local_bh_disable(); sock_prot_dec_use(sk->sk_prot); sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { local_bh_disable(); sock_prot_dec_use(sk->sk_prot); @@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); @@ -380,14 +381,15 @@ sticky_done: goto done; update: retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f829a4ad3ccc..1cf305a9f8dd 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; - mc_lst->sflock = RW_LOCK_UNLOCKED; + rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; /* diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 95d469271c4d..ea43ef1d94a7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -15,6 +15,7 @@ * - new extension header parser code */ #include <linux/config.h> +#include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> @@ -86,11 +87,6 @@ static DECLARE_MUTEX(ip6t_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -108,20 +104,15 @@ struct ip6t_table_info unsigned int underflow[NF_IP6_NUMHOOKS]; /* ip6t_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ip6t_target); static LIST_HEAD(ip6t_match); static LIST_HEAD(ip6t_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -376,8 +367,7 @@ ip6t_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -649,7 +639,8 @@ unconditional(const struct ip6t_ip6 *ipv6) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ip6t_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -658,7 +649,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e - = (struct ip6t_entry *)(newinfo->entries + pos); + = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -708,13 +699,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ip6t_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ip6t_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -731,7 +722,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ip6t_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -941,6 +932,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ip6t_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -961,11 +953,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -993,27 +985,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -1029,15 +1018,12 @@ replace_table(struct ip6t_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ip6t_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ip6t_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -1072,16 +1058,44 @@ add_entry_to_counter(const struct ip6t_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ip6t_table_info *t, struct ip6t_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IP6T_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1098,6 +1112,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry *e; struct ip6t_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1109,13 +1124,13 @@ copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on ourc node/cpu */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1127,7 +1142,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry_match *m; struct ip6t_entry_target *t; - e = (struct ip6t_entry *)(table->private->entries + off); + e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1196,6 +1211,46 @@ get_entries(const struct ip6t_get_entries *entries, return ret; } +static void free_table_info(struct ip6t_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ip6t_table_info *alloc_table_info(unsigned int size) +{ + struct ip6t_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1204,6 +1259,7 @@ do_replace(void __user *user, unsigned int len) struct ip6t_table *t; struct ip6t_table_info *newinfo, *oldinfo; struct ip6t_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1212,13 +1268,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1229,10 +1285,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1271,8 +1326,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ip6t_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1284,11 +1340,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ip6t_mutex); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1321,6 +1377,7 @@ do_add_counters(void __user *user, unsigned int len) struct ip6t_counters_info tmp, *paddc; struct ip6t_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1350,7 +1407,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IP6T_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1543,28 +1602,29 @@ int ip6t_register_table(struct ip6t_table *table, struct ip6t_table_info *newinfo; static struct ip6t_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ip6t_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1593,20 +1653,23 @@ int ip6t_register_table(struct ip6t_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ip6t_unregister_table(struct ip6t_table *table) { + void *loc_cpu_entry; + down(&ip6t_mutex); LIST_DELETE(&ip6t_tables, table); up(&ip6t_mutex); /* Decrease module usage counts and free resources */ - IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0cd1d1bd9033..ae4653bfd654 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/ip.h> #include <linux/spinlock.h> #include <linux/icmpv6.h> diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index dde37793d20b..268918d5deea 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c index 24bc0cde43a1..65937de1b58c 100644 --- a/net/ipv6/netfilter/ip6t_esp.c +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c2c52af9e560..f3e5ffbd592f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -98,7 +98,7 @@ struct nf_ct_frag6_queue #define FRAG6Q_HASHSZ 64 static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; -static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(nf_ct_frag6_lock); static u32 nf_ct_frag6_hash_rnd; static LIST_HEAD(nf_ct_frag6_lru_list); int nf_ct_frag6_nqueues = 0; @@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct init_timer(&fq->timer); fq->timer.function = nf_ct_frag6_expire; fq->timer.data = (long) fq; - fq->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&fq->lock); atomic_set(&fq->refcnt, 1); return nf_ct_frag6_intern(hash, fq); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a66900cda2af..66f1d12ea578 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -32,6 +32,7 @@ #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> +#include <linux/skbuff.h> #include <asm/uaccess.h> #include <asm/ioctls.h> #include <asm/bug.h> @@ -433,25 +434,14 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } + skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; /* FIXME: increment a raw6 drops counter here */ - goto out_free; + goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8827389abaf7..2947bc56d8a0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -48,6 +48,7 @@ #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> +#include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> @@ -59,6 +60,7 @@ #include <net/addrconf.h> #include <net/snmp.h> #include <net/dsfield.h> +#include <net/timewait_sock.h> #include <asm/uaccess.h> @@ -67,224 +69,33 @@ static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); -static struct tcp_func ipv6_mapped; -static struct tcp_func ipv6_specific; +static struct inet_connection_sock_af_ops ipv6_mapped; +static struct inet_connection_sock_af_ops ipv6_specific; -static inline int tcp_v6_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) -{ - const struct sock *sk2; - const struct hlist_node *node; - - /* We must walk the whole port owner list in this case. -DaveM */ - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; - } - - return node != NULL; -} - -/* Grrr, addr_type already calculated by caller, but I don't want - * to add some silly "cookie" argument to this method just for that. - * But it doesn't matter, the recalculation is in the rarest path - * this function ever takes. - */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - struct hlist_node *node; - int ret; - - local_bh_disable(); - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - - do { - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. */ - snum = rover; - } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (tb && !hlist_empty(&tb->owners)) { - if (tb->fastreuse > 0 && sk->sk_reuse && - sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_v6_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (tb == NULL) { - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); - if (tb == NULL) - goto fail_unlock; - } - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - -static __inline__ void __tcp_v6_hash(struct sock *sk) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - inet_listen_wlock(&tcp_hashinfo); - } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (tcp_hashinfo.ehash_size - 1); - list = &tcp_hashinfo.ehash[hash].chain; - lock = &tcp_hashinfo.ehash[hash].lock; - write_lock(lock); - } - - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet6_csk_bind_conflict); } - static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->af_specific == &ipv6_mapped) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { tcp_prot.hash(sk); return; } local_bh_disable(); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); local_bh_enable(); } } -/* - * Open request hash tables. - */ - -static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) -{ - u32 a, b, c; - - a = raddr->s6_addr32[0]; - b = raddr->s6_addr32[1]; - c = raddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); - - a += raddr->s6_addr32[3]; - b += (u32) rport; - __jhash_mix(a, b, c); - - return c & (TCP_SYNQ_HSIZE - 1); -} - -static struct request_sock *tcp_v6_search_req(const struct sock *sk, - struct request_sock ***prevp, - __u16 rport, - struct in6_addr *raddr, - struct in6_addr *laddr, - int iif) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct tcp6_request_sock *treq = tcp6_rsk(req); - - if (inet_rsk(req)->rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { - BUG_TRAP(req->sk == NULL); - *prevp = prev; - return req; - } - } - - return NULL; -} - static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, struct in6_addr *daddr, @@ -308,195 +119,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - if (tcptw->tw_ts_recent_stamp && - (!twp || - (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { - /* See comment in tcp_ipv4.c */ - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (!tp->write_seq) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); - goto unique; - } else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 tcpv6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int tcp_v6_hash_connect(struct sock *sk) -{ - unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + tcpv6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __tcp_v6_hash(sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v6_hash(sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p = NULL, final; @@ -571,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = tp->ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -583,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - tp->af_specific = &ipv6_mapped; + icsk->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - tp->ext_header_len = exthdrlen; - tp->af_specific = &ipv6_specific; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -643,16 +271,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tp->ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v6_hash_connect(sk); + err = inet6_hash_connect(&tcp_death_row, sk); if (err) goto late_failure; @@ -758,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (tp->pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ @@ -775,8 +404,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -822,7 +451,7 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; struct ipv6_txoptions *opt = NULL; @@ -888,8 +517,8 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (tcp6_rsk(req)->pktopts) - kfree_skb(tcp6_rsk(req)->pktopts); + if (inet6_rsk(req)->pktopts) + kfree_skb(inet6_rsk(req)->pktopts); } static struct request_sock_ops tcp6_request_sock_ops = { @@ -901,26 +530,15 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; -static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - - if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) || - (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || - ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; - } - return 0; -} - +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); @@ -1091,8 +709,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->source, + &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1116,23 +735,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp6_request_sock *treq; + struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); @@ -1157,7 +765,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp6_request_sock_ops); + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) goto drop; @@ -1170,7 +778,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); - treq = tcp6_rsk(req); + treq = inet6_rsk(req); ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); TCP_ECN_create_request(req, skb->h.th); @@ -1196,8 +804,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (tcp_v6_send_synack(sk, req, NULL)) goto drop; - tcp_v6_synq_add(sk, req); - + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop: @@ -1212,7 +819,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; @@ -1247,7 +854,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); - newtp->af_specific = &ipv6_mapped; + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; @@ -1261,10 +868,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 af_tcp.af_specific. + worked with IPv6 icsk.icsk_af_ops. Sync it now. */ - tcp_sync_mss(newsk, newtp->pmtu_cookie); + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -1371,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, sock_kfree_s(sk, opt, opt->tot_len); } - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newtp->ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); @@ -1382,7 +989,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; - __tcp_v6_hash(newsk); + __inet6_hash(&tcp_hashinfo, newsk); inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1679,139 +1286,16 @@ do_time_wait: goto discard_it; } -static int tcp_v6_rebuild_header(struct sock *sk) -{ - int err; - struct dst_entry *dst; - struct ipv6_pinfo *np = inet6_sk(sk); - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { - sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - return 0; -} - -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) -{ - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; - struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_sport = inet->sport; - fl.fl_ip_dport = inet->dport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); - - if (err) { - sk->sk_err_soft = -err; - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - skb->dst = dst_clone(dst); - - /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - - return ip6_xmit(sk, skb, &fl, np->opt, 0); -} - -static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); - sin6->sin6_port = inet_sk(sk)->dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; -} - static int tcp_v6_remember_stamp(struct sock *sk) { /* Alas, not yet... */ return 0; } -static struct tcp_func ipv6_specific = { - .queue_xmit = tcp_v6_xmit, +static struct inet_connection_sock_af_ops ipv6_specific = { + .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, - .rebuild_header = tcp_v6_rebuild_header, + .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v6_remember_stamp, @@ -1819,7 +1303,7 @@ static struct tcp_func ipv6_specific = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1827,7 +1311,7 @@ static struct tcp_func ipv6_specific = { * TCP over IPv4 via INET6 API */ -static struct tcp_func ipv6_mapped = { +static struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1838,7 +1322,7 @@ static struct tcp_func ipv6_mapped = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1877,8 +1361,9 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; - tp->af_specific = &ipv6_specific; + icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -1900,14 +1385,13 @@ static int tcp_v6_destroy_sock(struct sock *sk) static void get_openreq6(struct seq_file *seq, struct sock *sk, struct request_sock *req, int i, int uid) { - struct in6_addr *dest, *src; int ttd = req->expires - jiffies; + struct in6_addr *src = &inet6_rsk(req)->loc_addr; + struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; if (ttd < 0) ttd = 0; - src = &tcp6_rsk(req)->loc_addr; - dest = &tcp6_rsk(req)->rmt_addr; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", @@ -1988,14 +1472,14 @@ static void get_timewait6_sock(struct seq_file *seq, { struct in6_addr *dest, *src; __u16 destp, srcp; - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tcp6tw->tw_v6_daddr; - src = &tcp6tw->tw_v6_rcv_saddr; + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); @@ -2093,7 +1577,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, }; @@ -2110,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = { .ops = &inet6_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }; void __init tcpv6_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5cc8731eb55b..d8538dcea813 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -36,6 +36,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/init.h> +#include <linux/skbuff.h> #include <asm/uaccess.h> #include <net/sock.h> @@ -300,20 +301,7 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (flags & MSG_DONTWAIT) { UDP6_INC_STATS_USER(UDP_MIB_INERRORS); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 34b3bb868409..0dc519b40404 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink; static struct datalink_proto *p8023_datalink; static struct datalink_proto *pSNAP_datalink; -static struct proto_ops ipx_dgram_ops; +static const struct proto_ops ipx_dgram_ops; LIST_HEAD(ipx_interfaces); DEFINE_SPINLOCK(ipx_interfaces_lock); @@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = -EINVAL; break; default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } @@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { .family = PF_IPX, .owner = THIS_MODULE, .release = ipx_release, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 6f92f9c62990..fbfa96754417 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -62,12 +62,12 @@ static int irda_create(struct socket *sock, int protocol); -static struct proto_ops irda_stream_ops; -static struct proto_ops irda_seqpacket_ops; -static struct proto_ops irda_dgram_ops; +static const struct proto_ops irda_stream_ops; +static const struct proto_ops irda_seqpacket_ops; +static const struct proto_ops irda_dgram_ops; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops irda_ultra_ops; +static const struct proto_ops irda_ultra_ops; #define ULTRA_MAX_DATA 382 #endif /* CONFIG_IRDA_ULTRA */ @@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) - ret = sock_error(sk); + ret = sock_error(sk); + if (ret) + break; else if (sk->sk_shutdown & RCV_SHUTDOWN) ; else if (noblock) @@ -1821,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -EINVAL; default: IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); - return dev_ioctl(cmd, (void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ @@ -2463,7 +2464,7 @@ static struct net_proto_family irda_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2484,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2505,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2527,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { }; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, diff --git a/net/key/af_key.c b/net/key/af_key.c index 39031684b65c..52efd04cbedb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void) } -static struct proto_ops pfkey_ops; +static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { @@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = { [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), + [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ @@ -383,6 +384,55 @@ static int verify_address_len(void *p) return 0; } +static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx) +{ + int len = 0; + + len += sizeof(struct sadb_x_sec_ctx); + len += sec_ctx->sadb_x_ctx_len; + len += sizeof(uint64_t) - 1; + len /= sizeof(uint64_t); + + return len; +} + +static inline int verify_sec_ctx_len(void *p) +{ + struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p; + int len; + + if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE) + return -EINVAL; + + len = pfkey_sec_ctx_len(sec_ctx); + + if (sec_ctx->sadb_x_sec_len != len) + return -EINVAL; + + return 0; +} + +static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx) +{ + struct xfrm_user_sec_ctx *uctx = NULL; + int ctx_size = sec_ctx->sadb_x_ctx_len; + + uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL); + + if (!uctx) + return NULL; + + uctx->len = pfkey_sec_ctx_len(sec_ctx); + uctx->exttype = sec_ctx->sadb_x_sec_exttype; + uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; + uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; + uctx->ctx_len = sec_ctx->sadb_x_ctx_len; + memcpy(uctx + 1, sec_ctx + 1, + uctx->ctx_len); + + return uctx; +} + static int present_and_same_family(struct sadb_address *src, struct sadb_address *dst) { @@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h if (verify_address_len(p)) return -EINVAL; } + if (ext_type == SADB_X_EXT_SEC_CTX) { + if (verify_sec_ctx_len(p)) + return -EINVAL; + } ext_hdrs[ext_type-1] = p; } p += ext_len; @@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int ctx_size = 0; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); + + if ((xfrm_ctx = x->security)) { + ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); + size += sizeof(struct sadb_x_sec_ctx) + ctx_size; + } + /* identity & sensitivity */ if ((x->props.family == AF_INET && @@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, n_port->sadb_x_nat_t_port_reserved = 0; } + /* security context */ + if (xfrm_ctx) { + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, + sizeof(struct sadb_x_sec_ctx) + ctx_size); + sec_ctx->sadb_x_sec_len = + (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + return skb; } @@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, struct sadb_lifetime *lifetime; struct sadb_sa *sa; struct sadb_key *key; + struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; @@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } + + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + goto out; + + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + + if (err) + goto out; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; if (sa->sadb_sa_auth) { int keysize = 0; @@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) return 0; } +static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + + if (xfrm_ctx) { + int len = sizeof(struct sadb_x_sec_ctx); + len += xfrm_ctx->ctx_len; + return PFKEY_ALIGN8(len); + } + return 0; +} + static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) { int sockaddr_size = pfkey_sockaddr_size(xp->family); @@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + - (socklen * 2))); + (socklen * 2))) + + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) @@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i } } } + + /* security context */ + if ((xfrm_ctx = xp->security)) { + int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); + + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size); + sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); } @@ -1976,12 +2099,13 @@ out: static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) { - int err; + int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h if (xp->selector.dport) xp->selector.dport_mask = ~0; + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) { + err = -ENOBUFS; + goto out; + } + + err = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (err) + goto out; + } + xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; @@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); - if (err) { - kfree(xp); - return err; - } + + if (err) + goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; @@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h return 0; out: + security_xfrm_policy_free(xp); kfree(xp); return err; } @@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg int err; struct sadb_address *sa; struct sadb_x_policy *pol; - struct xfrm_policy *xp; + struct xfrm_policy *xp, tmp; struct xfrm_selector sel; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg if (sel.dport) sel.dport_mask = ~0; - xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + memset(&tmp, 0, sizeof(struct xfrm_policy)); + + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + return -ENOMEM; + + err = security_xfrm_policy_alloc(&tmp, uctx); + kfree(uctx); + + if (err) + return err; + } + + xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1); + security_xfrm_policy_free(&tmp); if (xp == NULL) return -ENOENT; @@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, { struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + struct sadb_x_sec_ctx *sec_ctx; switch (family) { case AF_INET: @@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; + /* security context too */ + if (len >= (pol->sadb_x_policy_len*8 + + sizeof(struct sadb_x_sec_ctx))) { + char *p = (char *)pol; + struct xfrm_user_sec_ctx *uctx; + + p += pol->sadb_x_policy_len*8; + sec_ctx = (struct sadb_x_sec_ctx *)p; + if (len < pol->sadb_x_policy_len*8 + + sec_ctx->sadb_x_sec_len) + goto out; + if ((*dir = verify_sec_ctx_len(p))) + goto out; + uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + *dir = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (*dir) + goto out; + } + *dir = pol->sadb_x_policy_dir-1; return xp; out: + security_xfrm_policy_free(xp); kfree(xp); return NULL; } @@ -2946,7 +3127,7 @@ out: return err; } -static struct proto_ops pfkey_ops = { +static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c3f0b0783453..8171c53bc0ed 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -36,7 +36,7 @@ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; -static struct proto_ops llc_ui_ops; +static const struct proto_ops llc_ui_ops; static int llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); @@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo) /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) { - rc = sock_error(sk); + rc = sock_error(sk); + if (rc) break; - } rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -960,7 +959,7 @@ out: static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } /** @@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops llc_ui_ops = { +static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cba63729313d..e10512e229b6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid) goto out_unlock; INIT_HLIST_NODE(&inst->hlist); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f28460b61e47..55afdda3d940 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid) atomic_set(&inst->id_sequence, 0); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 96020d7087e8..7849cac14d3a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) return 0; } -static struct proto_ops netlink_ops; +static const struct proto_ops netlink_ops; static int netlink_insert(struct sock *sk, u32 pid) { @@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&netlink_chain, nb); } -static struct proto_ops netlink_ops = { +static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 287cfcc56951..3b1378498d50 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -441,7 +441,7 @@ errout: } static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, - int seq, int cmd) + int seq, u8 cmd) { struct sk_buff *skb; int err; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index e5d82d711cae..63b0e4afeb33 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,7 +63,7 @@ static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); -static struct proto_ops nr_proto_ops; +static const struct proto_ops nr_proto_ops; /* * Socket removal during an interrupt is now safe. @@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; int ret; - lock_sock(sk); switch (cmd) { case TIOCOUTQ: { long amount; + + lock_sock(sk); amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amount < 0) amount = 0; @@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: { struct sk_buff *skb; long amount = 0L; + + lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; @@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCGSTAMP: + lock_sock(sk); ret = sock_get_timestamp(sk, argp); release_sock(sk); return ret; @@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: - release_sock(sk); return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: - release_sock(sk); if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: - release_sock(sk); - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } - release_sock(sk); return 0; } @@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops nr_proto_ops = { +static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, diff --git a/net/nonet.c b/net/nonet.c index e5241dceaa57..1230f0ae832e 100644 --- a/net/nonet.c +++ b/net/nonet.c @@ -14,11 +14,6 @@ #include <linux/init.h> #include <linux/kernel.h> -void __init sock_init(void) -{ - printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n"); -} - static int sock_no_open(struct inode *irrelevant, struct file *dontcare) { return -ENXIO; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3e2462760413..f69e5ed9bd06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk) } -static struct proto_ops packet_ops; +static const struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt; +static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, #endif default: - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } return 0; } @@ -1784,7 +1784,7 @@ out: #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt = { +static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, @@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = { }; #endif -static struct proto_ops packet_ops = { +static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 829fdbc4400b..63090be2315a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } return 0; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 82fb07aa06a5..ba5283204837 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -25,7 +25,7 @@ #include <net/pkt_sched.h> -#define VERSION "1.1" +#define VERSION "1.2" /* Network Emulation Queuing algorithm. ==================================== @@ -65,11 +65,12 @@ struct netem_sched_data { u32 jitter; u32 duplicate; u32 reorder; + u32 corrupt; struct crndstate { unsigned long last; unsigned long rho; - } delay_cor, loss_cor, dup_cor, reorder_cor; + } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; @@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->duplicate = dupsave; } + /* + * Randomized packet corruption. + * Make copy if needed since we are modifying + * If packet is going to be hardware checksummed, then + * do it now in software before we mangle it. + */ + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) + || (skb->ip_summed == CHECKSUM_HW + && skb_checksum_help(skb, 0))) { + sch->qstats.drops++; + return NET_XMIT_DROP; + } + + skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + } + if (q->gap == 0 /* not doing reordering */ || q->counter < q->gap /* inside last reordering gap */ || q->reorder < get_crandom(&q->reorder_cor)) { @@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) return 0; } +static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corrupt *r = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*r)) + return -EINVAL; + + q->corrupt = r->probability; + init_crandom(&q->corrupt_cor, r->correlation); + return 0; +} + +/* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) if (ret) return ret; } + if (tb[TCA_NETEM_REORDER-1]) { ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); if (ret) return ret; } - } + if (tb[TCA_NETEM_CORRUPT-1]) { + ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); + if (ret) + return ret; + } + } return 0; } @@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; + struct tc_netem_corrupt corrupt; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) reorder.correlation = q->reorder_cor.rho; RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + corrupt.probability = q->corrupt; + corrupt.correlation = q->corrupt_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rta->rta_len = skb->tail - b; return skb->len; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6cf0342706b5..c4a2a8c4c339 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -22,6 +22,7 @@ #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dec68a604773..9d05e13e92f6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) * 1000; - asoc->pmtu = 0; asoc->frag_point = 0; /* Set the association max_retrans and RTO values from the @@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->overall_error_count = 0; + /* Initialize the association's heartbeat interval based on the + * sock configured value. + */ + asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + + /* Initialize path max retrans value. */ + asoc->pathmaxrxt = sp->pathmaxrxt; + + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + + /* Set association default SACK delay */ + asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); + + /* Set the association default flags controlling + * Heartbeat, SACK delay, and Path MTU Discovery. + */ + asoc->param_flags = sp->param_flags; + /* Initialize the maximum mumber of new data packets that can be sent * in a burst. */ @@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = - SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; @@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, sctp_transport_set_owner(peer, asoc); + /* Initialize the peer's heartbeat interval based on the + * association configured value. + */ + peer->hbinterval = asoc->hbinterval; + + /* Set the path max_retrans. */ + peer->pathmaxrxt = asoc->pathmaxrxt; + + /* Initialize the peer's SACK delay timeout based on the + * association configured value. + */ + peer->sackdelay = asoc->sackdelay; + + /* Enable/disable heartbeat, SACK delay, and path MTU discovery + * based on association setting. + */ + peer->param_flags = asoc->param_flags; + /* Initialize the pmtu of the transport. */ - sctp_transport_pmtu(peer); + if (peer->param_flags & SPP_PMTUD_ENABLE) + sctp_transport_pmtu(peer); + else if (asoc->pathmtu) + peer->pathmtu = asoc->pathmtu; + else + peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. */ - if (asoc->pmtu) - asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); + if (asoc->pathmtu) + asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); else - asoc->pmtu = peer->pmtu; + asoc->pathmtu = peer->pathmtu; SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " - "%d\n", asoc, asoc->pmtu); + "%d\n", asoc, asoc->pathmtu); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. @@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * (for example, implementations MAY use the size of the * receiver advertised window). */ - peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); + peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); /* At this point, we may not have the receiver's advertised window, * so initialize ssthresh to the default value and it will be set @@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->partial_bytes_acked = 0; peer->flight_size = 0; - /* By default, enable heartbeat for peer address. */ - peer->hb_allowed = 1; - - /* Initialize the peer's heartbeat interval based on the - * sock configured value. - */ - peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval); - - /* Set the path max_retrans. */ - peer->max_retrans = sp->paddrparam.spp_pathmaxrxt; - /* Set the transport's RTO.initial value */ peer->rto = asoc->rto_initial; @@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each(pos, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); - if (!pmtu || (t->pmtu < pmtu)) - pmtu = t->pmtu; + if (!pmtu || (t->pathmtu < pmtu)) + pmtu = t->pathmtu; } if (pmtu) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); - asoc->pmtu = pmtu; + asoc->pathmtu = pmtu; asoc->frag_point = sctp_frag_point(sp, pmtu); } SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", - __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); + __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point); } /* Should we send a SACK to update our peer? */ @@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc) case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= - min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) + min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu))) return 1; break; default: diff --git a/net/sctp/input.c b/net/sctp/input.c index b24ff2c1aef5..238f1bffa684 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", __FUNCTION__, pmtu, - SCTP_DEFAULT_MINSEGMENT); - pmtu = SCTP_DEFAULT_MINSEGMENT; - } + if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu)) + return; - if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { - t->pmtu = pmtu; + if (t->param_flags & SPP_PMTUD_ENABLE) { + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + printk(KERN_WARNING "%s: Reported pmtu %d too low, " + "using default minimum of %d\n", + __FUNCTION__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment size and disable + * pmtu discovery on this transport. + */ + t->pathmtu = SCTP_DEFAULT_MINSEGMENT; + t->param_flags = (t->param_flags & ~SPP_HB) | + SPP_PMTUD_DISABLE; + } else { + t->pathmtu = pmtu; + } + + /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); - sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } + + /* Retransmit with the new pmtu setting. + * Normally, if PMTU discovery is disabled, an ICMP Fragmentation + * Needed will never be sent, but if a message was sent before + * PMTU discovery was disabled that was larger than the PMTU, it + * would not be fragmented, so it must be re-transmitted fragmented. + */ + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } /* diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fa3be2b8fb5f..15c05165c905 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, return 2; } -static struct proto_ops inet6_seqpacket_ops = { +static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, diff --git a/net/sctp/output.c b/net/sctp/output.c index 931371633464..a40991ef72c9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pmtu) : - (packet->transport->pmtu)); + (packet->transport->asoc->pathmtu) : + (packet->transport->pathmtu)); too_big = (psize + chunk_len > pmtu); @@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (!dst || (dst->obsolete > 1)) { dst_release(dst); sctp_transport_route(tp, NULL, sctp_sk(sk)); - sctp_assoc_sync_pmtu(asoc); + if (asoc->param_flags & SPP_PMTUD_ENABLE) { + sctp_assoc_sync_pmtu(asoc); + } } nskb->dst = dst_clone(tp->dst); @@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); - (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + if (tp->param_flags & SPP_PMTUD_ENABLE) + (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + else + (*tp->af_specific->sctp_xmit)(nskb, tp, 1); out: packet->size = packet->overhead; @@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * if ((flightsize + Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ - max_burst_bytes = asoc->max_burst * asoc->pmtu; + max_burst_bytes = asoc->max_burst * asoc->pathmtu; if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { transport->cwnd = transport->flight_size + max_burst_bytes; SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " @@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * data will fit or delay in hopes of bundling a full * sized packet. */ - if (len < asoc->pmtu - packet->overhead) { + if (len < asoc->pathmtu - packet->overhead) { retval = SCTP_XMIT_NAGLE_DELAY; goto finish; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f775d78aa59d..de693b43c8ea 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -54,6 +54,7 @@ #include <net/protocol.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/route.h> #include <net/sctp/sctp.h> #include <net/addrconf.h> #include <net/inet_common.h> @@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = { }; /* Socket operations. */ -static struct proto_ops inet_seqpacket_ops = { +static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 823947170a33..2d7d8a5db2ac 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, { __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; + struct sctp_transport *trans = asoc->peer.last_data_from; int error = 0; - if (force) + if (force || + (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || + (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, if (!asoc->peer.sack_needed) { /* We will need a SACK for the next packet. */ asoc->peer.sack_needed = 1; - goto out; + + /* Set the SACK delay timeout based on the + * SACK delay for the last transport + * data was received from, or the default + * for the association. + */ + if (trans) + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + trans->sackdelay; + else + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + asoc->sackdelay; + + /* Restart the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { if (asoc->a_rwnd > asoc->rwnd) asoc->a_rwnd = asoc->rwnd; @@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } -out: + return error; nomem: error = -ENOMEM; @@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE && - (transport->error_count++ >= transport->max_retrans)) { + (transport->error_count++ >= transport->pathmaxrxt)) { SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", " transport IP: port:%d failed.\n", asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 475bfb4972d9..557a7d90b92a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, * HEARTBEAT is sent (see Section 8.3). */ - if (transport->hb_allowed) { + if (transport->param_flags & SPP_HB_ENABLE) { if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, arg, commands)) @@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, return SCTP_DISPOSITION_DISCARD; } - max_interval = link->hb_interval + link->rto; + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ if (time_after(hbinfo->sent_at, jiffies) || @@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, * document allow. However, an SCTP transmitter MUST NOT be * more aggressive than the following algorithms allow. */ - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } - return SCTP_DISPOSITION_CONSUME; discard_force: @@ -2721,13 +2716,9 @@ discard_force: return SCTP_DISPOSITION_DISCARD; discard_noforce: - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } return SCTP_DISPOSITION_DISCARD; consume: return SCTP_DISPOSITION_CONSUME; @@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, * send another. */ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); return SCTP_DISPOSITION_CONSUME; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9df888e932c5..fc04d185fa33 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1941,107 +1941,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ +int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, + struct sctp_transport *trans, + struct sctp_association *asoc, + struct sctp_sock *sp, + int hb_change, + int pmtud_change, + int sackdelay_change) +{ + int error; + + if (params->spp_flags & SPP_HB_DEMAND && trans) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } + + if (params->spp_hbinterval) { + if (trans) { + trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } + } + + if (hb_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_HB) | hb_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_HB) | hb_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_HB) | hb_change; + } + } + + if (params->spp_pathmtu) { + if (trans) { + trans->pathmtu = params->spp_pathmtu; + sctp_assoc_sync_pmtu(asoc); + } else if (asoc) { + asoc->pathmtu = params->spp_pathmtu; + sctp_frag_point(sp, params->spp_pathmtu); + } else { + sp->pathmtu = params->spp_pathmtu; + } + } + + if (pmtud_change) { + if (trans) { + int update = (trans->param_flags & SPP_PMTUD_DISABLE) && + (params->spp_flags & SPP_PMTUD_ENABLE); + trans->param_flags = + (trans->param_flags & ~SPP_PMTUD) | pmtud_change; + if (update) { + sctp_transport_pmtu(trans); + sctp_assoc_sync_pmtu(asoc); + } + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_PMTUD) | pmtud_change; + } + } + + if (params->spp_sackdelay) { + if (trans) { + trans->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else { + sp->sackdelay = params->spp_sackdelay; + } + } + + if (sackdelay_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } + } + + if (params->spp_pathmaxrxt) { + if (trans) { + trans->pathmaxrxt = params->spp_pathmaxrxt; + } else if (asoc) { + asoc->pathmaxrxt = params->spp_pathmaxrxt; + } else { + sp->pathmaxrxt = params->spp_pathmaxrxt; + } + } + + return 0; +} + static int sctp_setsockopt_peer_addr_params(struct sock *sk, char __user *optval, int optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); int error; + int hb_change, pmtud_change, sackdelay_change; if (optlen != sizeof(struct sctp_paddrparams)) - return -EINVAL; + return - EINVAL; + if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; - /* - * API 7. Socket Options (setting the default value for the endpoint) - * All options that support specific settings on an association by - * filling in either an association id variable or a sockaddr_storage - * SHOULD also support setting of the same value for the entire endpoint - * (i.e. future associations). To accomplish this the following logic is - * used when setting one of these options: - - * c) If neither the sockaddr_storage or association identification is - * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and - * the association identification is 0, the settings are a default - * and to be applied to the endpoint (all future associations). - */ + /* Validate flags and value parameters. */ + hb_change = params.spp_flags & SPP_HB; + pmtud_change = params.spp_flags & SPP_PMTUD; + sackdelay_change = params.spp_flags & SPP_SACKDELAY; + + if (hb_change == SPP_HB || + pmtud_change == SPP_PMTUD || + sackdelay_change == SPP_SACKDELAY || + params.spp_sackdelay > 500 || + (params.spp_pathmtu + && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + return -EINVAL; - /* update default value for endpoint (all future associations) */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - /* Manual heartbeat on an endpoint is invalid. */ - if (0xffffffff == params.spp_hbinterval) + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) return -EINVAL; - else if (params.spp_hbinterval) - sctp_sk(sk)->paddrparam.spp_hbinterval = - params.spp_hbinterval; - if (params.spp_pathmaxrxt) - sctp_sk(sk)->paddrparam.spp_pathmaxrxt = - params.spp_pathmaxrxt; - return 0; } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - /* Applications can enable or disable heartbeats for any peer address - * of an association, modify an address's heartbeat interval, force a - * heartbeat to be sent immediately, and adjust the address's maximum - * number of retransmissions sent before an address is considered - * unreachable. - * - * The value of the heartbeat interval, in milliseconds. A value of - * UINT32_MAX (4294967295), when modifying the parameter, specifies - * that a heartbeat should be sent immediately to the peer address, - * and the current interval should remain unchanged. + /* Heartbeat demand can only be sent on a transport or + * association, but not a socket. */ - if (0xffffffff == params.spp_hbinterval) { - error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); - if (error) - return error; - } else { - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + return -EINVAL; + + /* Process parameters. */ + error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + + if (error) + return error; + + /* If changes are for association, also apply parameters to each + * transport. */ - if (params.spp_hbinterval) { - trans->hb_allowed = 1; - trans->hb_interval = - msecs_to_jiffies(params.spp_hbinterval); - } else - trans->hb_allowed = 0; + if (!trans && asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + } } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - if (params.spp_pathmaxrxt) - trans->max_retrans = params.spp_pathmaxrxt; + return 0; +} + +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_setsockopt_delayed_ack_time(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_assoc_value params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + /* Validate value parameter. */ + if (params.assoc_value > 500) + return -EINVAL; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (params.assoc_value) { + if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params.assoc_value); + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackdelay = params.assoc_value; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } else { + if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + + /* If change is for association, also apply to each transport. */ + if (asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + if (params.assoc_value) { + trans->sackdelay = + msecs_to_jiffies(params.assoc_value); + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + } + return 0; } @@ -2334,7 +2606,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl /* Update the frag_point of the existing associations. */ list_for_each(pos, &(sp->ep->asocs)) { asoc = list_entry(pos, struct sctp_association, asocs); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); } return 0; @@ -2491,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen); + break; + case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, optval, optlen); break; @@ -2715,8 +2991,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ - sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); - sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; + sp->hbinterval = jiffies_to_msecs(sctp_hb_interval); + sp->pathmaxrxt = sctp_max_retrans_path; + sp->pathmtu = 0; // allow default discovery + sp->sackdelay = sctp_sack_timeout; + sp->param_flags = SPP_HB_ENABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; /* If enabled no SCTP message fragmentation will be performed. * Configure through SCTP_DISABLE_FRAGMENTS socket option. @@ -2865,7 +3146,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); - status.sstat_primary.spinfo_mtu = transport->pmtu; + status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; @@ -2924,7 +3205,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); - pinfo.spinfo_mtu = transport->pmtu; + pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; @@ -3086,69 +3367,227 @@ out: * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); if (len != sizeof(struct sctp_paddrparams)) return -EINVAL; + if (copy_from_user(¶ms, optval, len)) return -EFAULT; - /* If no association id is specified retrieve the default value - * for the endpoint that will be used for all future associations + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; - params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; - - goto done; + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) { + SCTP_DEBUG_PRINTK("Failed no transport\n"); + return -EINVAL; + } } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) { + SCTP_DEBUG_PRINTK("Failed no association\n"); return -EINVAL; + } - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. - */ - if (!trans->hb_allowed) - params.spp_hbinterval = 0; - else - params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); + if (trans) { + /* Fetch transport values. */ + params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); + params.spp_pathmtu = trans->pathmtu; + params.spp_pathmaxrxt = trans->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = trans->param_flags; + } else if (asoc) { + /* Fetch association values. */ + params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); + params.spp_pathmtu = asoc->pathmtu; + params.spp_pathmaxrxt = asoc->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = asoc->param_flags; + } else { + /* Fetch socket values. */ + params.spp_hbinterval = sp->hbinterval; + params.spp_pathmtu = sp->pathmtu; + params.spp_sackdelay = sp->sackdelay; + params.spp_pathmaxrxt = sp->pathmaxrxt; + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = sp->param_flags; + } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - params.spp_pathmaxrxt = trans->max_retrans; + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + /* Fetch association values. */ + if (asoc->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = jiffies_to_msecs( + asoc->sackdelay); + else + params.assoc_value = 0; + } else { + /* Fetch socket values. */ + if (sp->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = sp->sackdelay; + else + params.assoc_value = 0; + } -done: if (copy_to_user(optval, ¶ms, len)) return -EFAULT; @@ -4015,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_getsockopt_delayed_ack_time(sk, len, optval, + optlen); + break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 268ddaf2dc0f..68d73e2dd155 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->init_sent_count = 0; peer->state = SCTP_ACTIVE; - peer->hb_allowed = 0; + peer->param_flags = SPP_HB_DISABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + peer->hbinterval = 0; /* Initialize the default path max_retrans. */ - peer->max_retrans = sctp_max_retrans_path; + peer->pathmaxrxt = sctp_max_retrans_path; peer->error_count = 0; INIT_LIST_HEAD(&peer->transmitted); @@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport) dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); dst_release(dst); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Caches the dst entry and source address for a transport's destination @@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport, af->get_saddr(asoc, dst, daddr, &transport->saddr); transport->dst = dst; + if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { + return; + } if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). @@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport, opt->pf->af->to_sk_saddr(&transport->saddr, asoc->base.sk); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Hold a reference to a transport. */ @@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; - pmtu = transport->asoc->pmtu; + pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less @@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); - transport->cwnd = transport->asoc->pmtu; + 4*transport->asoc->pathmtu); + transport->cwnd = transport->asoc->pathmtu; break; case SCTP_LOWER_CWND_FAST_RTX: @@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; break; @@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, if ((jiffies - transport->last_time_ecne_reduced) > transport->rtt) { transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } @@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, */ if ((jiffies - transport->last_time_used) > transport->rto) transport->cwnd = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); break; }; @@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, unsigned long sctp_transport_timeout(struct sctp_transport *t) { unsigned long timeout; - timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); + timeout = t->hbinterval + t->rto + sctp_jitter(t->rto); timeout += jiffies; return timeout; } diff --git a/net/socket.c b/net/socket.c index 3145103cdf54..06fa217f58a9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb) kfree(iocb->private); } -/* - * Read data from a socket. ubuf is a user mode pointer. We make sure the user - * area ubuf...ubuf+size-1 is writable before asking the protocol. - */ - -static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, - size_t size, loff_t pos) +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) { - struct sock_iocb *x, siocb; struct socket *sock; int flags; - if (pos != 0) - return -ESPIPE; - if (size==0) /* Match SYS5 behaviour */ - return 0; + sock = file->private_data; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; + flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (more) + flags |= MSG_MORE; + + return sock->ops->sendpage(sock, page, offset, size, flags); +} + +static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, + char __user *ubuf, size_t size, struct sock_iocb *siocb) +{ + if (!is_sync_kiocb(iocb)) { + siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); + if (!siocb) + return NULL; iocb->ki_dtor = sock_aio_dtor; } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_iov.iov_base = ubuf; - x->async_iov.iov_len = size; - flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + siocb->kiocb = iocb; + siocb->async_iov.iov_base = ubuf; + siocb->async_iov.iov_len = size; - return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); + iocb->private = siocb; + return siocb; } +static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; -/* - * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 - * is readable by the user process. - */ + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; -static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, - size_t size, loff_t pos) + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + + return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); +} + +static ssize_t sock_readv(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { - struct sock_iocb *x, siocb; - struct socket *sock; - + struct kiocb iocb; + struct sock_iocb siocb; + struct msghdr msg; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + + ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; + if (pos != 0) return -ESPIPE; - if(size==0) /* Match SYS5 behaviour */ + if (count == 0) /* Match SYS5 behaviour */ return 0; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; - iocb->ki_dtor = sock_aio_dtor; - } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (sock->type == SOCK_SEQPACKET) - x->async_msg.msg_flags |= MSG_EOR; - x->async_iov.iov_base = (void __user *)ubuf; - x->async_iov.iov_len = size; - - return __sock_sendmsg(iocb, sock, &x->async_msg, size); + x = alloc_sock_iocb(iocb, ubuf, count, &siocb); + if (!x) + return -ENOMEM; + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } -static ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) +static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) { - struct socket *sock; - int flags; + struct socket *sock = file->private_data; + size_t size = 0; + int i; - sock = file->private_data; + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; - flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (more) - flags |= MSG_MORE; + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + if (sock->type == SOCK_SEQPACKET) + msg->msg_flags |= MSG_EOR; - return sock->ops->sendpage(sock, page, offset, size, flags); + return __sock_sendmsg(iocb, sock, msg, size); } -static int sock_readv_writev(int type, - struct file * file, const struct iovec * iov, - long count, size_t size) +static ssize_t sock_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct msghdr msg; - struct socket *sock; + struct kiocb iocb; + struct sock_iocb siocb; + int ret; - sock = file->private_data; + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = count; - msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} - /* read() does a VERIFY_WRITE */ - if (type == VERIFY_WRITE) - return sock_recvmsg(sock, &msg, size, msg.msg_flags); +static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; - if (sock->type == SOCK_SEQPACKET) - msg.msg_flags |= MSG_EOR; + if (pos != 0) + return -ESPIPE; + if (count == 0) /* Match SYS5 behaviour */ + return 0; - return sock_sendmsg(sock, &msg, size); -} + x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb); + if (!x) + return -ENOMEM; -static ssize_t sock_readv(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_WRITE, - file, vector, count, tot_len); -} - -static ssize_t sock_writev(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_READ, - file, vector, count, tot_len); + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } @@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; default: err = sock->ops->ioctl(sock, cmd, arg); + + /* + * If this ioctl is unknown try to hand it down + * to the NIC driver. + */ + if (err == -ENOIOCTLCMD) + err = dev_ioctl(cmd, argp); break; } return err; @@ -2036,7 +2039,7 @@ int sock_unregister(int family) return 0; } -void __init sock_init(void) +static int __init sock_init(void) { /* * Initialize sock SLAB cache. @@ -2044,12 +2047,10 @@ void __init sock_init(void) sk_init(); -#ifdef SLAB_SKB /* * Initialize skbuff SLAB cache */ skb_init(); -#endif /* * Initialize the protocols module. @@ -2058,15 +2059,19 @@ void __init sock_init(void) init_inodecache(); register_filesystem(&sock_fs_type); sock_mnt = kern_mount(&sock_fs_type); - /* The real protocol initialization is performed when - * do_initcalls is run. + + /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER netfilter_init(); #endif + + return 0; } +core_initcall(sock_init); /* early initcall */ + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c6a51911e71e..d68eba481291 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_serv *serv = svsk->sk_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; - struct proto_ops *ops; + const struct proto_ops *ops; struct svc_sock *newsvsk; int err, slen; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index acc73ba8bade..5f6ae79b8b16 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -121,7 +121,7 @@ int sysctl_unix_max_dgram_qlen = 10; struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -DEFINE_RWLOCK(unix_table_lock); +DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); /* * SMP locking strategy: - * hash table is protected with rwlock unix_table_lock + * hash table is protected with spinlock unix_table_lock * each socket state is protected by separate rwlock. */ @@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) static inline void unix_remove_socket(struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_remove_socket(sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, @@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, { struct sock *s; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); s = __unix_find_socket_byname(sunname, len, type, hash); if (s) sock_hold(s); - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) struct sock *s; struct hlist_node *node; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; @@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) } s = NULL; found: - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); -static struct proto_ops unix_stream_ops = { +static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_dgram_ops = { +static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_seqpacket_ops = { +static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock) u = unix_sk(sk); u->dentry = NULL; u->mnt = NULL; - rwlock_init(&u->lock); + spin_lock_init(&u->lock); atomic_set(&u->inflight, sock ? 0 : -1); init_MUTEX(&u->readsem); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); @@ -642,12 +642,12 @@ retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(addr->name, addr->len, sock->type, addr->hash)) { - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... */ if (!(ordernum&0xFF)) yield(); @@ -658,7 +658,7 @@ retry: __unix_remove_socket(sk); u->addr = addr; __unix_insert_socket(&unix_socket_table[addr->hash], sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); err = 0; out: up(&u->readsem); @@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = UNIX_HASH_SIZE; } - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; @@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) __unix_insert_socket(list, sk); out_unlock: - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); out_up: up(&u->readsem); out: @@ -1063,10 +1063,12 @@ restart: /* Set credentials */ sk->sk_peercred = other->sk_peercred; - sock_hold(newsk); - unix_peer(sk) = newsk; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; + sock_hold(newsk); + + smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + unix_peer(sk) = newsk; unix_state_wunlock(sk); @@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } else { sunaddr = NULL; err = -ENOTCONN; - other = unix_peer_get(sk); + other = unix_peer(sk); if (!other) goto out_err; } @@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, other->sk_data_ready(other, size); sent+=size; } - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; @@ -1491,8 +1492,6 @@ pipe_err: send_sig(SIGPIPE,current,0); err = -EPIPE; out_err: - if (other) - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; return sent ? : err; @@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } return err; @@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); } @@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void unix_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6ffc64e1712d..411802bd4d37 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -182,7 +182,7 @@ void unix_gc(void) if (down_trylock(&unix_gc_sem)) return; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); forall_unix_sockets(i, s) { @@ -301,7 +301,7 @@ void unix_gc(void) } u->gc_tree = GC_ORPHAN; } - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* * Here we are. Hitlist is filled. Die. diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 59fec59b2132..7a43ae4721ed 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -181,7 +181,7 @@ struct wanpipe_opt #endif static int sk_count; -extern struct proto_ops wanpipe_ops; +extern const struct proto_ops wanpipe_ops; static unsigned long find_free_critical; static void wanpipe_unlink_driver(struct sock *sk); @@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar #endif default: - return dev_ioctl(cmd,(void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ } @@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr return 0; } -struct proto_ops wanpipe_ops = { +const struct proto_ops wanpipe_ops = { .family = PF_WANPIPE, .owner = THIS_MODULE, .release = wanpipe_release, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 020d73cc8414..16459c7f54b2 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); -static struct proto_ops x25_proto_ops; +static const struct proto_ops x25_proto_ops; static struct x25_address null_x25_address = {" "}; @@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } @@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d19e274b9c4a..64a447375fdb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -10,7 +10,7 @@ * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor - * + * */ #include <asm/bug.h> @@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer)) BUG(); + security_xfrm_policy_free(policy); kfree(policy); } EXPORT_SYMBOL(__xfrm_policy_destroy); @@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { - if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { + if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 && + xfrm_sec_ctx_match(pol->security, policy->security)) { if (excl) { write_unlock_bh(&xfrm_policy_lock); return -EEXIST; @@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete) +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete) { struct xfrm_policy *pol, **p; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { - if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { + if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) && + (xfrm_sec_ctx_match(ctx, pol->security))) { xfrm_pol_hold(pol); if (delete) *p = pol->next; @@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, } return pol; } -EXPORT_SYMBOL(xfrm_policy_bysel); +EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) { @@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk); /* Find policy to apply to this flow. */ -static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, +static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp) { struct xfrm_policy *pol; @@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, continue; match = xfrm_selector_match(sel, fl, family); + if (match) { - xfrm_pol_hold(pol); - break; + if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) { + xfrm_pol_hold(pol); + break; + } } } read_unlock_bh(&xfrm_policy_lock); @@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, *obj_refp = &pol->refcnt; } -static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + }; +} + +static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid) { struct xfrm_policy *pol; read_lock_bh(&xfrm_policy_lock); if ((pol = sk->sk_policy[dir]) != NULL) { - int match = xfrm_selector_match(&pol->selector, fl, + int match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); + int err = 0; + if (match) + err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir)); + + if (match && !err) xfrm_pol_hold(pol); else pol = NULL; @@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir) if (newp) { newp->selector = old->selector; + if (security_xfrm_policy_clone(old, newp)) { + kfree(newp); + return NULL; /* ENOMEM */ + } newp->lft = old->lft; newp->curlft = old->curlft; newp->action = old->action; @@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } -static inline int policy_to_flow_dir(int dir) -{ - if (XFRM_POLICY_IN == FLOW_DIR_IN && - XFRM_POLICY_OUT == FLOW_DIR_OUT && - XFRM_POLICY_FWD == FLOW_DIR_FWD) - return dir; - switch (dir) { - default: - case XFRM_POLICY_IN: - return FLOW_DIR_IN; - case XFRM_POLICY_OUT: - return FLOW_DIR_OUT; - case XFRM_POLICY_FWD: - return FLOW_DIR_FWD; - }; -} static int stale_bundle(struct dst_entry *dst); @@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, int err; u32 genid; u16 family = dst_orig->ops->family; + u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + u32 sk_sid = security_sk_sid(sk, fl, dir); restart: genid = atomic_read(&flow_cache_genid); policy = NULL; if (sk && sk->sk_policy[1]) - policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid); if (!policy) { /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) return 0; - policy = flow_cache_lookup(fl, family, - policy_to_flow_dir(XFRM_POLICY_OUT), + policy = flow_cache_lookup(fl, sk_sid, family, dir, xfrm_policy_lookup); } @@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, { struct xfrm_policy *pol; struct flowi fl; + u8 fl_dir = policy_to_flow_dir(dir); + u32 sk_sid; if (_decode_session(skb, &fl, family) < 0) return 0; + sk_sid = security_sk_sid(sk, &fl, fl_dir); + /* First, check used SA against their selectors. */ if (skb->sp) { int i; for (i=skb->sp->len-1; i>=0; i--) { - struct sec_decap_state *xvec = &(skb->sp->x[i]); + struct sec_decap_state *xvec = &(skb->sp->x[i]); if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) return 0; @@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; if (sk && sk->sk_policy[dir]) - pol = xfrm_sk_policy_lookup(sk, dir, &fl); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid); if (!pol) - pol = flow_cache_lookup(&fl, family, - policy_to_flow_dir(dir), + pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir, xfrm_policy_lookup); if (!pol) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 479effc97666..e12d0be5f976 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -10,7 +10,7 @@ * Split up af-specific functions * Derek Atkins <derek@ihtfp.com> * Add UDP Encapsulation - * + * */ #include <linux/workqueue.h> @@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + security_xfrm_state_free(x); kfree(x); } @@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, selector. */ if (x->km.state == XFRM_STATE_VALID) { - if (!xfrm_selector_match(&x->sel, fl, family)) + if (!xfrm_selector_match(&x->sel, fl, family) || + !xfrm_sec_ctx_match(pol->security, x->security)) continue; if (!best || best->km.dying > x->km.dying || @@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, acquire_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, family)) + if (xfrm_selector_match(&x->sel, fl, family) && + xfrm_sec_ctx_match(pol->security, x->security)) error = -ESRCH; } } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0cdd9a07e043..92e2b804c606 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -7,7 +7,7 @@ * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support - * + * */ #include <linux/module.h> @@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma) return 0; } + +static inline int verify_sec_ctx_len(struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1]; + struct xfrm_user_sec_ctx *uctx; + int len = 0; + + if (!rt) + return 0; + + if (rt->rta_len < sizeof(*uctx)) + return -EINVAL; + + uctx = RTA_DATA(rt); + + if (uctx->ctx_len > PAGE_SIZE) + return -EINVAL; + + len += sizeof(struct xfrm_user_sec_ctx); + len += uctx->ctx_len; + + if (uctx->len != len) + return -EINVAL; + + return 0; +} + + static int verify_newsa_info(struct xfrm_usersa_info *p, struct rtattr **xfrma) { @@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; if ((err = verify_encap_tmpl(xfrma))) goto out; + if ((err = verify_sec_ctx_len(xfrma))) + goto out; err = -EINVAL; switch (p->mode) { @@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a return 0; } + +static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + int len = 0; + + if (xfrm_ctx) { + len += sizeof(struct xfrm_user_sec_ctx); + len += xfrm_ctx->ctx_len; + } + return len; +} + +static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg) +{ + struct xfrm_user_sec_ctx *uctx; + + if (!u_arg) + return 0; + + uctx = RTA_DATA(u_arg); + return security_xfrm_state_alloc(x, uctx); +} + static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { memcpy(&x->id, &p->id, sizeof(x->id)); @@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p, if (err) goto error; + if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1]))) + goto error; + x->km.seq = p->seq; return x; @@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) int err; struct km_event c; - err = verify_newsa_info(p, (struct rtattr **) xfrma); + err = verify_newsa_info(p, (struct rtattr **)xfrma); if (err) return err; - x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); + x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err); if (!x) return err; @@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) if (x->encap) RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + if (x->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + x->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = x->security->ctx_doi; + uctx->ctx_alg = x->security->ctx_alg; + uctx->ctx_len = x->security->ctx_len; + memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len); + } nlh->nlmsg_len = skb->tail - b; out: sp->this_idx++; @@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) return verify_policy_dir(p->dir); } +static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1]; + struct xfrm_user_sec_ctx *uctx; + + if (!rt) + return 0; + + uctx = RTA_DATA(rt); + return security_xfrm_policy_alloc(pol, uctx); +} + static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, int nr) { @@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, } copy_from_user_policy(xp, p); - err = copy_from_user_tmpl(xp, xfrma); + + if (!(err = copy_from_user_tmpl(xp, xfrma))) + err = copy_from_user_sec_ctx(xp, xfrma); + if (err) { *errp = err; kfree(xp); @@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr err = verify_newpolicy_info(p); if (err) return err; + err = verify_sec_ctx_len((struct rtattr **)xfrma); + if (err) + return err; - xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); + xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err); if (!xp) return err; @@ -761,6 +849,27 @@ rtattr_failure: return -1; } +static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb) +{ + if (xp->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + xp->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = xp->security->ctx_doi; + uctx->ctx_alg = xp->security->ctx_alg; + uctx->ctx_len = xp->security->ctx_len; + memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len); + } + return 0; + + rtattr_failure: + return -1; +} + static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct xfrm_dump_info *sp = ptr; @@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr copy_to_user_policy(xp, p, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; out: @@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr if (p->index) xp = xfrm_policy_byid(p->dir, p->index, delete); - else - xp = xfrm_policy_bysel(p->dir, &p->sel, delete); + else { + struct rtattr **rtattrs = (struct rtattr **)xfrma; + struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1]; + struct xfrm_policy tmp; + + err = verify_sec_ctx_len(rtattrs); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct xfrm_policy)); + if (rt) { + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + if ((err = security_xfrm_policy_alloc(&tmp, uctx))) + return err; + } + xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete); + security_xfrm_policy_free(&tmp); + } if (xp == NULL) return -ENOENT; @@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, copy_to_user_policy(xp, &upe->pol, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; upe->hard = !!hard; nlh->nlmsg_len = skb->tail - b; @@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; diff --git a/security/Kconfig b/security/Kconfig index 64d3f1e9ca85..34f593410d57 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -54,6 +54,19 @@ config SECURITY_NETWORK implement socket and networking access controls. If you are unsure how to answer this question, answer N. +config SECURITY_NETWORK_XFRM + bool "XFRM (IPSec) Networking Security Hooks" + depends on XFRM && SECURITY_NETWORK + help + This enables the XFRM (IPSec) networking security hooks. + If enabled, a security module can use these hooks to + implement per-packet access controls based on labels + derived from IPSec policy. Non-IPSec communications are + designated as unlabelled, and only sockets authorized + to communicate unlabelled data can send without using + IPSec. + If you are unsure how to answer this question, answer N. + config SECURITY_CAPABILITIES tristate "Default Linux Capabilities" depends on SECURITY diff --git a/security/dummy.c b/security/dummy.c index 3ca5f2b828a0..a15c54709fde 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -776,8 +776,42 @@ static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t pr static inline void dummy_sk_free_security (struct sock *sk) { } + +static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir) +{ + return 0; +} #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM +static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static inline int dummy_xfrm_policy_clone_security(struct xfrm_policy *old, struct xfrm_policy *new) +{ + return 0; +} + +static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp) +{ +} + +static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static void dummy_xfrm_state_free_security(struct xfrm_state *x) +{ +} + +static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + return 0; +} +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ static int dummy_register_security (const char *name, struct security_operations *ops) { return -EINVAL; @@ -970,7 +1004,16 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, socket_getpeersec); set_to_dummy_if_null(ops, sk_alloc_security); set_to_dummy_if_null(ops, sk_free_security); -#endif /* CONFIG_SECURITY_NETWORK */ + set_to_dummy_if_null(ops, sk_getsid); + #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM + set_to_dummy_if_null(ops, xfrm_policy_alloc_security); + set_to_dummy_if_null(ops, xfrm_policy_clone_security); + set_to_dummy_if_null(ops, xfrm_policy_free_security); + set_to_dummy_if_null(ops, xfrm_state_alloc_security); + set_to_dummy_if_null(ops, xfrm_state_free_security); + set_to_dummy_if_null(ops, xfrm_policy_lookup); +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS set_to_dummy_if_null(ops, key_alloc); set_to_dummy_if_null(ops, key_free); diff --git a/security/selinux/Makefile b/security/selinux/Makefile index b038cd0fae2e..06d54d9d20a5 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o selinux-$(CONFIG_SECURITY_NETWORK) += netif.o +selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o + EXTRA_CFLAGS += -Isecurity/selinux/include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index fc774436a264..3d496eae1b47 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -73,6 +73,7 @@ #include "avc.h" #include "objsec.h" #include "netif.h" +#include "xfrm.h" #define XATTR_SELINUX_SUFFIX "selinux" #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX @@ -3349,6 +3350,10 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) err = avc_has_perm(sock_sid, port_sid, sock_class, recv_perm, &ad); } + + if (!err) + err = selinux_xfrm_sock_rcv_skb(sock_sid, skb); + out: return err; } @@ -3401,6 +3406,24 @@ static void selinux_sk_free_security(struct sock *sk) sk_free_security(sk); } +static unsigned int selinux_sk_getsid_security(struct sock *sk, struct flowi *fl, u8 dir) +{ + struct inode_security_struct *isec; + u32 sock_sid = SECINITSID_ANY_SOCKET; + + if (!sk) + return selinux_no_sk_sid(fl); + + read_lock_bh(&sk->sk_callback_lock); + isec = get_sock_isec(sk); + + if (isec) + sock_sid = isec->sid; + + read_unlock_bh(&sk->sk_callback_lock); + return sock_sid; +} + static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb) { int err = 0; @@ -3536,6 +3559,11 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum, send_perm, &ad) ? NF_DROP : NF_ACCEPT; } + if (err != NF_ACCEPT) + goto out; + + err = selinux_xfrm_postroute_last(isec->sid, skb); + out: return err; } @@ -4380,6 +4408,16 @@ static struct security_operations selinux_ops = { .socket_getpeersec = selinux_socket_getpeersec, .sk_alloc_security = selinux_sk_alloc_security, .sk_free_security = selinux_sk_free_security, + .sk_getsid = selinux_sk_getsid_security, +#endif + +#ifdef CONFIG_SECURITY_NETWORK_XFRM + .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc, + .xfrm_policy_clone_security = selinux_xfrm_policy_clone, + .xfrm_policy_free_security = selinux_xfrm_policy_free, + .xfrm_state_alloc_security = selinux_xfrm_state_alloc, + .xfrm_state_free_security = selinux_xfrm_state_free, + .xfrm_policy_lookup = selinux_xfrm_policy_lookup, #endif }; @@ -4491,6 +4529,7 @@ static int __init selinux_nf_ip_init(void) panic("SELinux: nf_register_hook for IPv6: error %d\n", err); #endif /* IPV6 */ + out: return err; } diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 1deb59e1b762..71aeb12f07c8 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -238,3 +238,5 @@ S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost") S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") + S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELFROM, "relabelfrom") + S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELTO, "relabelto") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index a78b5d59c9fc..d1d0996049e3 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -908,6 +908,8 @@ #define ASSOCIATION__SENDTO 0x00000001UL #define ASSOCIATION__RECVFROM 0x00000002UL +#define ASSOCIATION__RELABELFROM 0x00000004UL +#define ASSOCIATION__RELABELTO 0x00000008UL #define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL #define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h new file mode 100644 index 000000000000..8e87996c6dd5 --- /dev/null +++ b/security/selinux/include/xfrm.h @@ -0,0 +1,54 @@ +/* + * SELinux support for the XFRM LSM hooks + * + * Author : Trent Jaeger, <jaegert@us.ibm.com> + */ +#ifndef _SELINUX_XFRM_H_ +#define _SELINUX_XFRM_H_ + +int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx); +int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new); +void selinux_xfrm_policy_free(struct xfrm_policy *xp); +int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx); +void selinux_xfrm_state_free(struct xfrm_state *x); +int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir); + +/* + * Extract the security blob from the sock (it's actually on the socket) + */ +static inline struct inode_security_struct *get_sock_isec(struct sock *sk) +{ + if (!sk->sk_socket) + return NULL; + + return SOCK_INODE(sk->sk_socket)->i_security; +} + + +static inline u32 selinux_no_sk_sid(struct flowi *fl) +{ + /* NOTE: no sock occurs on ICMP reply, forwards, ... */ + /* icmp_reply: authorize as kernel packet */ + if (fl && fl->proto == IPPROTO_ICMP) { + return SECINITSID_KERNEL; + } + + return SECINITSID_ANY_SOCKET; +} + +#ifdef CONFIG_SECURITY_NETWORK_XFRM +int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb); +int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb); +#else +static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb) +{ + return 0; +} + +static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb) +{ + return NF_ACCEPT; +} +#endif + +#endif /* _SELINUX_XFRM_H_ */ diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c new file mode 100644 index 000000000000..c4d87d4dca7b --- /dev/null +++ b/security/selinux/xfrm.c @@ -0,0 +1,311 @@ +/* + * NSA Security-Enhanced Linux (SELinux) security module + * + * This file contains the SELinux XFRM hook function implementations. + * + * Authors: Serge Hallyn <sergeh@us.ibm.com> + * Trent Jaeger <jaegert@us.ibm.com> + * + * Copyright (C) 2005 International Business Machines Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ + +/* + * USAGE: + * NOTES: + * 1. Make sure to enable the following options in your kernel config: + * CONFIG_SECURITY=y + * CONFIG_SECURITY_NETWORK=y + * CONFIG_SECURITY_NETWORK_XFRM=y + * CONFIG_SECURITY_SELINUX=m/y + * ISSUES: + * 1. Caching packets, so they are not dropped during negotiation + * 2. Emulating a reasonable SO_PEERSEC across machines + * 3. Testing addition of sk_policy's with security context via setsockopt + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/security.h> +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/skbuff.h> +#include <linux/xfrm.h> +#include <net/xfrm.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <asm/semaphore.h> + +#include "avc.h" +#include "objsec.h" +#include "xfrm.h" + + +/* + * Returns true if an LSM/SELinux context + */ +static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx) +{ + return (ctx && + (ctx->ctx_doi == XFRM_SC_DOI_LSM) && + (ctx->ctx_alg == XFRM_SC_ALG_SELINUX)); +} + +/* + * Returns true if the xfrm contains a security blob for SELinux + */ +static inline int selinux_authorizable_xfrm(struct xfrm_state *x) +{ + return selinux_authorizable_ctx(x->security); +} + +/* + * LSM hook implementation that authorizes that a socket can be used + * with the corresponding xfrm_sec_ctx and direction. + */ +int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + int rc = 0; + u32 sel_sid = SECINITSID_UNLABELED; + struct xfrm_sec_ctx *ctx; + + /* Context sid is either set to label or ANY_ASSOC */ + if ((ctx = xp->security)) { + if (!selinux_authorizable_ctx(ctx)) + return -EINVAL; + + sel_sid = ctx->ctx_sid; + } + + rc = avc_has_perm(sk_sid, sel_sid, SECCLASS_ASSOCIATION, + ((dir == FLOW_DIR_IN) ? ASSOCIATION__RECVFROM : + ((dir == FLOW_DIR_OUT) ? ASSOCIATION__SENDTO : + (ASSOCIATION__SENDTO | ASSOCIATION__RECVFROM))), + NULL); + + return rc; +} + +/* + * Security blob allocation for xfrm_policy and xfrm_state + * CTX does not have a meaningful value on input + */ +static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx) +{ + int rc = 0; + struct task_security_struct *tsec = current->security; + struct xfrm_sec_ctx *ctx; + + BUG_ON(!uctx); + BUG_ON(uctx->ctx_doi != XFRM_SC_ALG_SELINUX); + + if (uctx->ctx_len >= PAGE_SIZE) + return -ENOMEM; + + *ctxp = ctx = kmalloc(sizeof(*ctx) + + uctx->ctx_len, + GFP_KERNEL); + + if (!ctx) + return -ENOMEM; + + ctx->ctx_doi = uctx->ctx_doi; + ctx->ctx_len = uctx->ctx_len; + ctx->ctx_alg = uctx->ctx_alg; + + memcpy(ctx->ctx_str, + uctx+1, + ctx->ctx_len); + rc = security_context_to_sid(ctx->ctx_str, + ctx->ctx_len, + &ctx->ctx_sid); + + if (rc) + goto out; + + /* + * Does the subject have permission to set security or permission to + * do the relabel? + * Must be permitted to relabel from default socket type (process type) + * to specified context + */ + rc = avc_has_perm(tsec->sid, tsec->sid, + SECCLASS_ASSOCIATION, + ASSOCIATION__RELABELFROM, NULL); + if (rc) + goto out; + + rc = avc_has_perm(tsec->sid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, + ASSOCIATION__RELABELTO, NULL); + if (rc) + goto out; + + return rc; + +out: + *ctxp = 0; + kfree(ctx); + return rc; +} + +/* + * LSM hook implementation that allocs and transfers uctx spec to + * xfrm_policy. + */ +int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *uctx) +{ + int err; + + BUG_ON(!xp); + + err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx); + return err; +} + + +/* + * LSM hook implementation that copies security data structure from old to + * new for policy cloning. + */ +int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) +{ + struct xfrm_sec_ctx *old_ctx, *new_ctx; + + old_ctx = old->security; + + if (old_ctx) { + new_ctx = new->security = kmalloc(sizeof(*new_ctx) + + old_ctx->ctx_len, + GFP_KERNEL); + + if (!new_ctx) + return -ENOMEM; + + memcpy(new_ctx, old_ctx, sizeof(*new_ctx)); + memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len); + } + return 0; +} + +/* + * LSM hook implementation that frees xfrm_policy security information. + */ +void selinux_xfrm_policy_free(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *ctx = xp->security; + if (ctx) + kfree(ctx); +} + +/* + * LSM hook implementation that allocs and transfers sec_ctx spec to + * xfrm_state. + */ +int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx) +{ + int err; + + BUG_ON(!x); + + err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx); + return err; +} + +/* + * LSM hook implementation that frees xfrm_state security information. + */ +void selinux_xfrm_state_free(struct xfrm_state *x) +{ + struct xfrm_sec_ctx *ctx = x->security; + if (ctx) + kfree(ctx); +} + +/* + * LSM hook that controls access to unlabelled packets. If + * a xfrm_state is authorizable (defined by macro) then it was + * already authorized by the IPSec process. If not, then + * we need to check for unlabelled access since this may not have + * gone thru the IPSec process. + */ +int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb) +{ + int i, rc = 0; + struct sec_path *sp; + + sp = skb->sp; + + if (sp) { + /* + * __xfrm_policy_check does not approve unless xfrm_policy_ok + * says that spi's match for policy and the socket. + * + * Only need to verify the existence of an authorizable sp. + */ + for (i = 0; i < sp->len; i++) { + struct xfrm_state *x = sp->x[i].xvec; + + if (x && selinux_authorizable_xfrm(x)) + goto accept; + } + } + + /* check SELinux sock for unlabelled access */ + rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION, + ASSOCIATION__RECVFROM, NULL); + if (rc) + goto drop; + +accept: + return 0; + +drop: + return rc; +} + +/* + * POSTROUTE_LAST hook's XFRM processing: + * If we have no security association, then we need to determine + * whether the socket is allowed to send to an unlabelled destination. + * If we do have a authorizable security association, then it has already been + * checked in xfrm_policy_lookup hook. + */ +int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb) +{ + struct dst_entry *dst; + int rc = 0; + + dst = skb->dst; + + if (dst) { + struct dst_entry *dst_test; + + for (dst_test = dst; dst_test != 0; + dst_test = dst_test->child) { + struct xfrm_state *x = dst_test->xfrm; + + if (x && selinux_authorizable_xfrm(x)) + goto accept; + } + } + + rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION, + ASSOCIATION__SENDTO, NULL); + if (rc) + goto drop; + +accept: + return NF_ACCEPT; + +drop: + return NF_DROP; +} |