From c6f408996c625cb950cad024f90e50519f94713c Mon Sep 17 00:00:00 2001 From: Mukund Jampala Date: Sun, 16 Dec 2012 19:25:58 +0100 Subject: netfilter: ip[6]t_REJECT: fix wrong transport header pointer in TCP reset The problem occurs when iptables constructs the tcp reset packet. It doesn't initialize the pointer to the tcp header within the skb. When the skb is passed to the ixgbe driver for transmit, the ixgbe driver attempts to access the tcp header and crashes. Currently, other drivers (such as our 1G e1000e or igb drivers) don't access the tcp header on transmit unless the TSO option is turned on. <1>BUG: unable to handle kernel NULL pointer dereference at 0000000d <1>IP: [] ixgbe_xmit_frame_ring+0x8cc/0x2260 [ixgbe] <4>*pdpt = 0000000085e5d001 *pde = 0000000000000000 <0>Oops: 0000 [#1] SMP [...] <4>Pid: 0, comm: swapper Tainted: P 2.6.35.12 #1 Greencity/Thurley <4>EIP: 0060:[] EFLAGS: 00010246 CPU: 16 <4>EIP is at ixgbe_xmit_frame_ring+0x8cc/0x2260 [ixgbe] <4>EAX: c7628820 EBX: 00000007 ECX: 00000000 EDX: 00000000 <4>ESI: 00000008 EDI: c6882180 EBP: dfc6b000 ESP: ced95c48 <4> DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 <0>Process swapper (pid: 0, ti=ced94000 task=ced73bd0 task.ti=ced94000) <0>Stack: <4> cbec7418 c779e0d8 c77cc888 c77cc8a8 0903010a 00000000 c77c0008 00000002 <4><0> cd4997c0 00000010 dfc6b000 00000000 d0d176c9 c77cc8d8 c6882180 cbec7318 <4><0> 00000004 00000004 cbec7230 cbec7110 00000000 cbec70c0 c779e000 00000002 <0>Call Trace: <4> [] ? 0xd0d176c9 <4> [] ? 0xd0d18a4d <4> [<411e243e>] ? dev_hard_start_xmit+0x218/0x2d7 <4> [<411f03d7>] ? sch_direct_xmit+0x4b/0x114 <4> [<411f056a>] ? __qdisc_run+0xca/0xe0 <4> [<411e28b0>] ? dev_queue_xmit+0x2d1/0x3d0 <4> [<411e8120>] ? neigh_resolve_output+0x1c5/0x20f <4> [<411e94a1>] ? neigh_update+0x29c/0x330 <4> [<4121cf29>] ? arp_process+0x49c/0x4cd <4> [<411f80c9>] ? nf_hook_slow+0x3f/0xac <4> [<4121ca8d>] ? arp_process+0x0/0x4cd <4> [<4121ca8d>] ? arp_process+0x0/0x4cd <4> [<4121c6d5>] ? 
T.901+0x38/0x3b <4> [<4121c918>] ? arp_rcv+0xa3/0xb4 <4> [<4121ca8d>] ? arp_process+0x0/0x4cd <4> [<411e1173>] ? __netif_receive_skb+0x32b/0x346 <4> [<411e19e1>] ? netif_receive_skb+0x5a/0x5f <4> [<411e1ea9>] ? napi_skb_finish+0x1b/0x30 <4> [] ? ixgbe_xmit_frame_ring+0x1564/0x2260 [ixgbe] <4> [<41013468>] ? lapic_next_event+0x13/0x16 <4> [<410429b2>] ? clockevents_program_event+0xd2/0xe4 <4> [<411e1b03>] ? net_rx_action+0x55/0x127 <4> [<4102da1a>] ? __do_softirq+0x77/0xeb <4> [<4102dab1>] ? do_softirq+0x23/0x27 <4> [<41003a67>] ? do_IRQ+0x7d/0x8e <4> [<41002a69>] ? common_interrupt+0x29/0x30 <4> [<41007bcf>] ? mwait_idle+0x48/0x4d <4> [<4100193b>] ? cpu_idle+0x37/0x4c <0>Code: df 09 d7 0f 94 c2 0f b6 d2 e9 e7 fb ff ff 31 db 31 c0 e9 38 ff ff ff 80 78 06 06 0f 85 3e fb ff ff 8b 7c 24 38 8b 8f b8 00 00 00 <0f> b6 51 0d f6 c2 01 0f 85 27 fb ff ff 80 e2 02 75 0d 8b 6c 24 <0>EIP: [] ixgbe_xmit_frame_ring+0x8cc/0x2260 [ixgbe] SS:ESP Signed-off-by: Mukund Jampala Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_REJECT.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 51f13f8ec724..04b18c1ac345 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -81,6 +81,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) niph->saddr = oiph->daddr; niph->daddr = oiph->saddr; + skb_reset_transport_header(nskb); tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); memset(tcph, 0, sizeof(*tcph)); tcph->source = oth->dest; -- cgit v1.2.3 From c65ef8dc7b1c16379b9fc29e925716a10804af43 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Wed, 12 Dec 2012 14:23:49 +0000 Subject: netfilter: nf_nat: Also handle non-ESTABLISHED routing changes in MASQUERADE Since (a0ecb85 netfilter: nf_nat: Handle routing changes in MASQUERADE target), the MASQUERADE target handles routing changes which affect the output interface of a connection, but only 
for ESTABLISHED connections. It is also possible for NEW connections which already have a conntrack entry to be affected by routing changes. This adds a check to drop entries in the NEW+conntrack state when the oif has changed. Signed-off-by: Andrew Collins Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_nat.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index da2c8a368f68..eeaff7e4acb5 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -124,23 +124,28 @@ nf_nat_ipv4_fn(unsigned int hooknum, ret = nf_nat_rule_find(skb, hooknum, in, out, ct); if (ret != NF_ACCEPT) return ret; - } else + } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); + if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) + goto oif_changed; + } break; default: /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) { - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; - } + if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) + goto oif_changed; } return nf_nat_packet(ct, ctinfo, hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; } static unsigned int -- cgit v1.2.3 From 9dd4a13a89d7c27e51cb87b9e95e82d8999826da Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 3 Jan 2013 19:02:12 +0100 Subject: net/ipv4/ipconfig: really display the BOOTP/DHCP server's address. Up to now, the debug and info messages from the ipconfig subsystem claim to display the IP address of the DHCP/BOOTP server but display instead the IP address of the bootserver. Fix that. Signed-off-by: Philippe De Muyter Signed-off-by: David S. 
Miller --- net/ipv4/ipconfig.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d763701cff1b..a2e50ae80b53 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -136,6 +136,8 @@ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ __be32 ic_gateway = NONE; /* Gateway IP address */ +__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */ + __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ @@ -558,6 +560,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ic_myaddr == NONE) ic_myaddr = tip; ic_servaddr = sip; + ic_addrservaddr = sip; ic_got_reply = IC_RARP; drop_unlock: @@ -1068,7 +1071,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_servaddr = server_id; #ifdef IPCONFIG_DEBUG printk("DHCP: Offered address %pI4 by server %pI4\n", - &ic_myaddr, &ic_servaddr); + &ic_myaddr, &b->iph.saddr); #endif /* The DHCP indicated server address takes * precedence over the bootp header one if @@ -1113,6 +1116,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_dev = dev; ic_myaddr = b->your_ip; ic_servaddr = b->server_ip; + ic_addrservaddr = b->iph.saddr; if (ic_gateway == NONE && b->relay_ip) ic_gateway = b->relay_ip; if (ic_nameservers[0] == NONE) @@ -1268,7 +1272,7 @@ static int __init ic_dynamic(void) printk("IP-Config: Got %s answer from %pI4, ", ((ic_got_reply & IC_RARP) ? "RARP" : (ic_proto_enabled & IC_USE_DHCP) ? 
"DHCP" : "BOOTP"), - &ic_servaddr); + &ic_addrservaddr); pr_cont("my address is %pI4\n", &ic_myaddr); return 0; -- cgit v1.2.3 From c7e2e1d72ed7707239d20525e0ebcad7e3303659 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Sat, 5 Jan 2013 11:19:24 +0000 Subject: ipv4: fix NULL checking in devinet_ioctl() The NULL pointer check `!ifa' should come before its first use. [ Bug origin : commit fd23c3b31107e2fc483301ee923d8a1db14e53f4 (ipv4: Add hash table of interface addresses) in linux-2.6.39 ] Signed-off-by: Xi Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cc06a47f1216..a8e4f2665d5e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -823,9 +823,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; ifa = inet_alloc_ifa(); - INIT_HLIST_NODE(&ifa->hash); if (!ifa) break; + INIT_HLIST_NODE(&ifa->hash); if (colon) memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); else -- cgit v1.2.3 From c9be4a5c49cf51cc70a993f004c5bb30067a65ce Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 7 Jan 2013 21:17:00 +0000 Subject: net: prevent setting ttl=0 via IP_TTL A regression is introduced by the following commit: commit 4d52cfbef6266092d535237ba5a4b981458ab171 Author: Eric Dumazet Date: Tue Jun 2 00:42:16 2009 -0700 net: ipv4/ip_sockglue.c cleanups Pure cleanups but it is not a pure cleanup... - if (val != -1 && (val < 1 || val>255)) + if (val != -1 && (val < 0 || val > 255)) Since there is no reason provided to allow ttl=0, change it back. Reported-by: nitin padalia Cc: nitin padalia Cc: Eric Dumazet Cc: David S. Miller Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/ip_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3c9d20880283..d9c4f113d709 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -590,7 +590,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TTL: if (optlen < 1) goto e_inval; - if (val != -1 && (val < 0 || val > 255)) + if (val != -1 && (val < 1 || val > 255)) goto e_inval; inet->uc_ttl = val; break; -- cgit v1.2.3 From ff905b1e4aad8ccbbb0d42f7137f19482742ff07 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jan 2013 07:06:10 +0000 Subject: tcp: splice: fix an infinite loop in tcp_read_sock() commit 02275a2ee7c0 (tcp: don't abort splice() after small transfers) added a regression. [ 83.843570] INFO: rcu_sched self-detected stall on CPU [ 83.844575] INFO: rcu_sched detected stalls on CPUs/tasks: { 6} (detected by 0, t=21002 jiffies, g=4457, c=4456, q=13132) [ 83.844582] Task dump for CPU 6: [ 83.844584] netperf R running task 0 8966 8952 0x0000000c [ 83.844587] 0000000000000000 0000000000000006 0000000000006c6c 0000000000000000 [ 83.844589] 000000000000006c 0000000000000096 ffffffff819ce2bc ffffffffffffff10 [ 83.844592] ffffffff81088679 0000000000000010 0000000000000246 ffff880c4b9ddcd8 [ 83.844594] Call Trace: [ 83.844596] [] ? vprintk_emit+0x1c9/0x4c0 [ 83.844601] [] ? schedule+0x29/0x70 [ 83.844606] [] ? tcp_splice_data_recv+0x42/0x50 [ 83.844610] [] ? tcp_read_sock+0xda/0x260 [ 83.844613] [] ? tcp_prequeue_process+0xb0/0xb0 [ 83.844615] [] ? tcp_splice_read+0xc0/0x250 [ 83.844618] [] ? sock_splice_read+0x22/0x30 [ 83.844622] [] ? do_splice_to+0x7b/0xa0 [ 83.844627] [] ? sys_splice+0x59c/0x5d0 [ 83.844630] [] ? putname+0x2b/0x40 [ 83.844633] [] ? do_sys_open+0x174/0x1e0 [ 83.844636] [] ? system_call_fastpath+0x16/0x1b if recv_actor() returns 0, we should stop immediately, because looping won't give a chance to drain the pipe. 
Signed-off-by: Eric Dumazet Cc: Willy Tarreau Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1ca253635f7a..5f173dccd1e9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1482,7 +1482,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } used = recv_actor(desc, skb, offset, len); - if (used < 0) { + if (used <= 0) { if (!copied) copied = used; break; -- cgit v1.2.3 From f26845b43c75d3f32f98d194c1327b5b1e6b3fb0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jan 2013 20:59:09 +0000 Subject: tcp: fix splice() and tcp collapsing interaction Under unusual circumstances, TCP collapse can split a big GRO TCP packet while it's being used in a splice(socket->pipe) operation. skb_splice_bits() releases the socket lock before calling splice_to_pipe(). [ 1081.353685] WARNING: at net/ipv4/tcp.c:1330 tcp_cleanup_rbuf+0x4d/0xfc() [ 1081.371956] Hardware name: System x3690 X5 -[7148Z68]- [ 1081.391820] cleanup rbuf bug: copied AD3BCF1 seq AD370AF rcvnxt AD3CF13 To fix this problem, we must eat skbs in tcp_recv_skb(). Remove the inline keyword from tcp_recv_skb() definition since it has three call sites. Reported-by: Christian Becker Cc: Willy Tarreau Signed-off-by: Eric Dumazet Tested-by: Willy Tarreau Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5f173dccd1e9..2aa69c8ae60c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1428,12 +1428,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) } #endif -static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; - skb_queue_walk(&sk->sk_receive_queue, skb) { + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (tcp_hdr(skb)->syn) offset--; @@ -1441,6 +1441,11 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) *off = offset; return skb; } + /* This looks weird, but this can happen if TCP collapsing + * splitted a fat GRO packet, while we released socket lock + * in skb_splice_bits() + */ + sk_eat_skb(sk, skb, false); } return NULL; } @@ -1520,8 +1525,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ - if (copied > 0) + if (copied > 0) { + tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); + } return copied; } EXPORT_SYMBOL(tcp_read_sock); -- cgit v1.2.3 From 7b514a886ba50e3b99295b00805f0d5ad750ca66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jan 2013 16:18:47 +0000 Subject: tcp: accept RST without ACK flag commit c3ae62af8e755 (tcp: should drop incoming frames without ACK flag set) added a regression on the handling of RST messages. RST should be allowed to come even without ACK bit set. We validate the RST by checking the exact sequence, as requested by RFC 793 and 5961 3.2, in tcp_validate_incoming() Reported-by: Eric Wong Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Eric Wong Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a28e4db8a952..18f97ca76b00 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5543,7 +5543,7 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; - if (!th->ack) + if (!th->ack && !th->rst) goto discard; /* @@ -5988,7 +5988,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } - if (!th->ack) + if (!th->ack && !th->rst) goto discard; if (!tcp_validate_incoming(sk, skb, th, 0)) -- cgit v1.2.3