author     Linus Torvalds <torvalds@linux-foundation.org>   2016-05-17 16:26:30 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-05-17 16:26:30 -0700
commit     a7fd20d1c476af4563e66865213474a2f9f473a4
tree       fb1399e2f82842450245fb058a8fb23c52865f43 /net/ipv4/tcp_output.c
parent     b80fed9595513384424cd141923c9161c4b5021b
parent     917fa5353da05e8a0045b8acacba8d50400d5b12
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Support SPI based w5100 devices, from Akinobu Mita.
2) Partial Segmentation Offload, from Alexander Duyck.
3) Add GMAC4 support to stmmac driver, from Alexandre TORGUE.
4) Allow cls_flower stats offload, from Amir Vadai.
5) Implement bpf blinding, from Daniel Borkmann.
6) Optimize _ASYNC_ bit twiddling on sockets; unless the socket is
actually using FASYNC, these atomics are superfluous. From Eric
Dumazet.
7) Run TCP more preemptibly, also from Eric Dumazet.
8) Support LED blinking, EEPROM dumps, and rxvlan offloading in mlx5e
driver, from Gal Pressman.
9) Allow creating ppp devices via rtnetlink, from Guillaume Nault.
10) Improve BPF usage documentation, from Jesper Dangaard Brouer.
11) Support tunneling offloads in qed, from Manish Chopra.
12) aRFS offloading in mlx5e, from Maor Gottlieb.
13) Add RFS and RPS support to SCTP protocol, from Marcelo Ricardo
Leitner.
14) Add MSG_EOR support to TCP; this allows controlling packet
coalescing on application record boundaries for more accurate
socket timestamp sampling. From Martin KaFai Lau. (A usage sketch
follows this list.)
15) Fix alignment of 64-bit netlink attributes across the board, from
Nicolas Dichtel.
16) Per-vlan stats in bridging, from Nikolay Aleksandrov.
17) Several conversions of drivers to ethtool ksettings, from Philippe
Reynes.
18) Checksum neutral ILA in ipv6, from Tom Herbert.
19) Factorize all of the various marvell dsa drivers into one, from
Vivien Didelot.
20) Add VF support to qed driver, from Yuval Mintz"
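
A minimal user-space sketch of what highlight 14 enables, assuming a kernel that carries this series; the send_record() helper and the SO_TIMESTAMPING setup are illustrative only, not taken from the patches themselves:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <linux/net_tstamp.h>

    /* Send one application record on an already-connected TCP socket.
     * MSG_EOR marks the record boundary: the kernel will not coalesce the
     * tail of this write with data from a later send(), so a TX timestamp
     * requested for this record maps to the record's own last byte.
     */
    static ssize_t send_record(int fd, const void *buf, size_t len)
    {
        /* Illustrative timestamp setup (software TX stamps, per-record id). */
        int val = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_OPT_ID;

        setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));

        return send(fd, buf, len, MSG_EOR);
    }

Without MSG_EOR, two back-to-back records may be merged into one skb and the timestamp key then refers to the combined segment rather than the first record's last byte.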
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1649 commits)
Revert "phy dp83867: Fix compilation with CONFIG_OF_MDIO=m"
Revert "phy dp83867: Make rgmii parameters optional"
r8169: default to 64-bit DMA on recent PCIe chips
phy dp83867: Make rgmii parameters optional
phy dp83867: Fix compilation with CONFIG_OF_MDIO=m
bpf: arm64: remove callee-save registers use for tmp registers
asix: Fix offset calculation in asix_rx_fixup() causing slow transmissions
switchdev: pass pointer to fib_info instead of copy
net_sched: close another race condition in tcf_mirred_release()
tipc: fix nametable publication field in nl compat
drivers: net: Don't print unpopulated net_device name
qed: add support for dcbx.
ravb: Add missing free_irq() calls to ravb_close()
qed: Remove a stray tab
net: ethernet: fec-mpc52xx: use phy_ethtool_{get|set}_link_ksettings
net: ethernet: fec-mpc52xx: use phydev from struct net_device
bpf, doc: fix typo on bpf_asm descriptions
stmmac: hardware TX COE doesn't work when force_thresh_dma_mode is set
net: ethernet: fs-enet: use phy_ethtool_{get|set}_link_ksettings
net: ethernet: fs-enet: use phydev from struct net_device
...
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--   net/ipv4/tcp_output.c   165
1 file changed, 93 insertions(+), 72 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 79a03b87a771..8bd9911fdd16 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -364,7 +364,7 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
  * be sent.
  */
 static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
-			 int tcp_header_len)
+			 struct tcphdr *th, int tcp_header_len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -375,7 +375,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 			INET_ECN_xmit(sk);
 			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
 				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
-				tcp_hdr(skb)->cwr = 1;
+				th->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
 		} else if (!tcp_ca_needs_ecn(sk)) {
@@ -383,7 +383,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 			INET_ECN_dontxmit(sk);
 		}
 		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
-			tcp_hdr(skb)->ece = 1;
+			th->ece = 1;
 	}
 }
 
@@ -949,12 +949,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
-	th = tcp_hdr(skb);
+	th = (struct tcphdr *)skb->data;
 	th->source		= inet->inet_sport;
 	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
@@ -962,14 +962,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->tcp_flags);
 
-	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-		/* RFC1323: The window in SYN & SYN/ACK segments
-		 * is never scaled.
-		 */
-		th->window	= htons(min(tp->rcv_wnd, 65535U));
-	} else {
-		th->window	= htons(tcp_select_window(sk));
-	}
 	th->check		= 0;
 	th->urg_ptr		= 0;
 
@@ -986,9 +978,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
-	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
-		tcp_ecn_send(sk, skb, tcp_header_size);
-
+	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+		th->window      = htons(tcp_select_window(sk));
+		tcp_ecn_send(sk, skb, th, tcp_header_size);
+	} else {
+		/* RFC1323: The window in SYN & SYN/ACK segments
+		 * is never scaled.
+		 */
+		th->window	= htons(min(tp->rcv_wnd, 65535U));
+	}
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
@@ -1111,11 +1109,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->txstamp_ack ||
+	       (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
 static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	if (unlikely(tcp_has_tx_tstamp(skb)) &&
 	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
 		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
 		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -1123,9 +1127,17 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 		shinfo->tx_flags &= ~tsflags;
 		shinfo2->tx_flags |= tsflags;
 		swap(shinfo->tskey, shinfo2->tskey);
+		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+		TCP_SKB_CB(skb)->txstamp_ack = 0;
 	}
 }
 
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+	TCP_SKB_CB(skb)->eor = 0;
+}
+
 /* Function to create two new TCP segments. Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list. This won't be called frequently, I hope.
@@ -1171,6 +1183,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->tcp_flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_skb_fragment_eor(skb, buff);
 
 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Copy and checksum data tail into the new buffer. */
@@ -1731,6 +1744,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	/* This packet was never sent out yet, so no SACK bits. */
 	TCP_SKB_CB(buff)->sacked = 0;
 
+	tcp_skb_fragment_eor(skb, buff);
+
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 	tcp_fragment_tstamp(skb, buff);
@@ -2204,14 +2219,13 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 /* Thanks to skb fast clones, we can detect if a prior transmit of
  * a packet is still in a qdisc or driver queue.
  * In this case, there is very little point doing a retransmit !
- * Note: This is called from BH context only.
  */
 static bool skb_still_in_host_queue(const struct sock *sk,
 				    const struct sk_buff *skb)
 {
 	if (unlikely(skb_fclone_busy(sk, skb))) {
-		NET_INC_STATS_BH(sock_net(sk),
-				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+		NET_INC_STATS(sock_net(sk),
+			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 		return true;
 	}
 	return false;
@@ -2266,14 +2280,14 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb))
+	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
 	/* Record snd_nxt for loss detection. */
 	tp->tlp_high_seq = tp->snd_nxt;
 
 probe_sent:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
 	/* Reset s.t. tcp_rearm_rto will restart timer from now */
 	inet_csk(sk)->icsk_pending = 0;
 rearm_timer:
@@ -2444,14 +2458,15 @@ u32 __tcp_select_window(struct sock *sk)
 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 			     const struct sk_buff *next_skb)
 {
-	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
-	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
-	if (unlikely(tsflags)) {
+	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+		const struct skb_shared_info *next_shinfo =
+			skb_shinfo(next_skb);
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		shinfo->tx_flags |= tsflags;
+		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
 		shinfo->tskey = next_shinfo->tskey;
+		TCP_SKB_CB(skb)->txstamp_ack |=
+			TCP_SKB_CB(next_skb)->txstamp_ack;
 	}
 }
 
@@ -2490,6 +2505,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	 * packet counting does not break.
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 
 	/* changed transmit queue under us so clear hints */
 	tcp_clear_retrans_hints_partial(tp);
@@ -2541,6 +2557,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
+		if (!tcp_skb_can_collapse_to(to))
+			break;
+
 		space -= skb->len;
 
 		if (first) {
@@ -2567,17 +2586,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller. Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
-	int err;
+	int diff, len, err;
+
 
-	/* Inconslusive MTU probe */
-	if (icsk->icsk_mtup.probe_size) {
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
-	}
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2610,30 +2629,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
-	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+	len = cur_mss * segs;
+	if (skb->len > len) {
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		int oldpcount = tcp_skb_pcount(skb);
+		if (skb_unclone(skb, GFP_ATOMIC))
+			return -ENOMEM;
 
-		if (unlikely(oldpcount > 1)) {
-			if (skb_unclone(skb, GFP_ATOMIC))
-				return -ENOMEM;
-			tcp_init_tso_segs(skb, cur_mss);
-			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-		}
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		if (skb->len < cur_mss)
+			tcp_retrans_try_collapse(sk, skb, cur_mss);
 	}
 
 	/* RFC3168, section 6.1.1.1. ECN fallback */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
-	tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-	/* Make a copy, if the first transmission SKB clone we made
-	 * is still in somebody's hands, else make a clone.
-	 */
-
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2651,20 +2667,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (likely(!err)) {
+		segs = tcp_skb_pcount(skb);
+
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans++;
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+		tp->total_retrans += segs;
 	}
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	int err = __tcp_retransmit_skb(sk, skb, segs);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2680,7 +2698,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
 	} else if (err != -EBUSY) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 
 	if (tp->undo_retrans < 0)
@@ -2755,6 +2773,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+		int segs;
 
 		if (skb == tcp_send_head(sk))
 			break;
@@ -2762,14 +2781,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
 
-		/* Assume this retransmit will generate
-		 * only one packet for congestion window
-		 * calculation purposes. This works because
-		 * tcp_retransmit_skb() will chop up the
-		 * packet to be MSS sized and all the
-		 * packet counting works out.
-		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+		if (segs <= 0)
 			return;
 
 		if (fwd_rexmitting) {
@@ -2806,10 +2819,10 @@ begin_fwd:
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb))
+		if (tcp_retransmit_skb(sk, skb, segs))
 			return;
 
-		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+		NET_INC_STATS(sock_net(sk), mib_idx);
 
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
@@ -2962,7 +2975,7 @@ int tcp_send_synack(struct sock *sk)
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
 				struct tcp_fastopen_cookie *foc,
-				bool attach_req)
+				enum tcp_synack_type synack_type)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -2982,14 +2995,22 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 
-	if (attach_req) {
+	switch (synack_type) {
+	case TCP_SYNACK_NORMAL:
 		skb_set_owner_w(skb, req_to_sk(req));
-	} else {
+		break;
+	case TCP_SYNACK_COOKIE:
+		/* Under synflood, we do not attach skb to a socket,
+		 * to avoid false sharing.
+		 */
+		break;
+	case TCP_SYNACK_FASTOPEN:
 		/* sk is a const pointer, because we want to express multiple
 		 * cpu might call us concurrently.
 		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
 		 */
 		skb_set_owner_w(skb, (struct sock *)sk);
+		break;
 	}
 	skb_dst_set(skb, dst);
 
@@ -3017,7 +3038,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
 
-	th = tcp_hdr(skb);
+	th = (struct tcphdr *)skb->data;
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
@@ -3038,7 +3059,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
 	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
-	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
@@ -3534,10 +3555,10 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	int res;
 
 	tcp_rsk(req)->txhash = net_tx_rndhash();
-	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
+	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
 	if (!res) {
-		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 	}
 	return res;
 }