From 7de414a9dd91426318df7b63da024b2b07e53df5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 1 Nov 2018 12:02:37 -0700 Subject: net: drop skb on failure in ip_check_defrag() Most callers of pskb_trim_rcsum() simply drop the skb when it fails, however, ip_check_defrag() still continues to pass the skb up to stack. This is suspicious. In ip_check_defrag(), after we learn the skb is an IP fragment, passing the skb to callers makes no sense, because callers expect fragments are defrag'ed on success. So, dropping the skb when we can't defrag it is reasonable. Note, prior to commit 88078d98d1bb, this is not a big problem as checksum will be fixed up anyway. After it, the checksum is not correct on failure. Found this during code review. Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends") Cc: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9b0158fa431f..d6ee343fdb86 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -722,10 +722,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) - return skb; - if (pskb_trim_rcsum(skb, netoff + len)) - return skb; + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { + kfree_skb(skb); + return NULL; + } + if (pskb_trim_rcsum(skb, netoff + len)) { + kfree_skb(skb); + return NULL; + } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; -- cgit v1.2.3 From 97adaddaa6db7a8af81b9b11e30cbe3628cd6700 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 22:31:41 +0900 Subject: net: bpfilter: fix iptables failure if bpfilter_umh is disabled When iptables command is executed, ip_{set/get}sockopt() try to upload bpfilter.ko if bpfilter is enabled. if it couldn't find bpfilter.ko, command is failed. bpfilter.ko is generated if CONFIG_BPFILTER_UMH is enabled. ip_{set/get}sockopt() only checks CONFIG_BPFILTER. So that if CONFIG_BPFILTER is enabled and CONFIG_BPFILTER_UMH is disabled, iptables command is always failed. test config: CONFIG_BPFILTER=y # CONFIG_BPFILTER_UMH is not set test command: %iptables -L iptables: No chain/target/match by that name. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 26c36cccabdc..fffcc130900e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1246,7 +1246,7 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); @@ -1559,7 +1559,7 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); @@ -1596,7 +1596,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -- cgit v1.2.3 From 0d5b9311baf27bb545f187f12ecfd558220c607d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Nov 2018 17:34:27 -0800 Subject: inet: frags: better deal with smp races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple cpus might attempt to insert a new fragment in rhashtable, if for example RPS is buggy, as reported by 배석진 in https://patchwork.ozlabs.org/patch/994601/ We use rhashtable_lookup_get_insert_key() instead of rhashtable_insert_fast() to let cpus losing the race free their own inet_frag_queue and use the one that was inserted by another cpu. Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Eric Dumazet Reported-by: 배석진 Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index bcb11f3a27c0..760a9e52e02b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -178,21 +178,22 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - void *arg) + void *arg, + struct inet_frag_queue **prev) { struct inet_frags *f = nf->f; struct inet_frag_queue *q; - int err; q = inet_frag_alloc(nf, f, arg); - if (!q) + if (!q) { + *prev = ERR_PTR(-ENOMEM); return NULL; - + } mod_timer(&q->timer, jiffies + nf->timeout); - err = rhashtable_insert_fast(&nf->rhashtable, &q->node, - f->rhash_params); - if (err < 0) { + *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key, + &q->node, f->rhash_params); + if (*prev) { q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q); inet_frag_destroy(q); @@ -204,22 +205,22 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_queue *fq; + struct inet_frag_queue *fq = NULL, *prev; if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; rcu_read_lock(); - fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); - if (fq) { + prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (!prev) + fq = inet_frag_create(nf, key, &prev); + if (prev && !IS_ERR(prev)) { + fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) fq = NULL; - rcu_read_unlock(); - return fq; } rcu_read_unlock(); - - return inet_frag_create(nf, key); + return fq; } EXPORT_SYMBOL(inet_frag_find); -- cgit v1.2.3 From 16f7eb2b77b55da816c4e207f3f9440a8cafc00a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Nov 2018 16:58:19 +0100 Subject: ip_tunnel: don't force DF when MTU is locked The various types of tunnels running over IPv4 can ask to set the DF bit to do PMTU discovery. However, PMTU discovery is subject to the threshold set by the net.ipv4.route.min_pmtu sysctl, and is also disabled on routes with "mtu lock". In those cases, we shouldn't set the DF bit. This patch makes setting the DF bit conditional on the route's MTU locking state. This issue seems to be older than git history. Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index dde671e97829..c248e0dccbe1 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -80,7 +80,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df; iph->protocol = proto; iph->tos = tos; iph->daddr = dst; -- cgit v1.2.3 From cadf9df27e7cf40e390e060a1c71bb86ecde798b Mon Sep 17 00:00:00 2001 From: Stephen Mallon Date: Tue, 20 Nov 2018 19:15:02 +1100 Subject: tcp: Fix SOF_TIMESTAMPING_RX_HARDWARE to use the latest timestamp during TCP coalescing During tcp coalescing ensure that the skb hardware timestamp refers to the highest sequence number data. Previously only the software timestamp was updated during coalescing. Signed-off-by: Stephen Mallon Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2868ef28ce52..e695584bb33f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4363,6 +4363,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; to->tstamp = from->tstamp; + skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; } return true; -- cgit v1.2.3 From 86de5921a3d5dd246df661e09bdd0a6131b39ae3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Nov 2018 05:53:59 -0800 Subject: tcp: defer SACK compression after DupThresh Jean-Louis reported a TCP regression and bisected to recent SACK compression. After a loss episode (receiver not able to keep up and dropping packets because its backlog is full), linux TCP stack is sending a single SACK (DUPACK). Sender waits a full RTO timer before recovering losses. While RFC 6675 says in section 5, "Algorithm Details", (2) If DupAcks < DupThresh but IsLost (HighACK + 1) returns true -- indicating at least three segments have arrived above the current cumulative acknowledgment point, which is taken to indicate loss -- go to step (4). ... (4) Invoke fast retransmit and enter loss recovery as follows: there are old TCP stacks not implementing this strategy, and still counting the dupacks before starting fast retransmit. While these stacks probably perform poorly when receivers implement LRO/GRO, we should be a little more gentle to them. This patch makes sure we do not enable SACK compression unless 3 dupacks have been sent since last rcv_nxt update. Ideally we should even rearm the timer to send one or two more DUPACK if no more packets are coming, but that will be work aiming for linux-4.21. Many thanks to Jean-Louis for bisecting the issue, providing packet captures and testing this patch. Fixes: 5d9f4262b7ea ("tcp: add SACK compression") Reported-by: Jean-Louis Dupond Tested-by: Jean-Louis Dupond Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 ++++++++++++-- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 2 +- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e695584bb33f..1e37c1388189 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4268,7 +4268,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; @@ -5189,7 +5189,17 @@ send_now: if (!tcp_is_sack(tp) || tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) goto send_now; - tp->compressed_ack++; + + if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { + tp->compressed_ack_rcv_nxt = tp->rcv_nxt; + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = 0; + } + + if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + goto send_now; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9c34b97d365d..3f510cad0b3e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -180,10 +180,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack)) { + if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack); - tp->compressed_ack = 0; + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = TCP_FASTRETRANS_THRESH; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 676020663ce8..5f8b6d3cd855 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -740,7 +740,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, -- cgit v1.2.3