summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c180
1 files changed, 95 insertions, 85 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28e029632493..2549b29b062d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
*/
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
@@ -77,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_ecn __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
@@ -89,6 +93,8 @@ int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;
+int sysctl_tcp_thin_dupack __read_mostly;
+
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
@@ -176,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk);
@@ -253,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
sizeof(struct sk_buff);
- if (sk->sk_sndbuf < 3 * sndmem)
- sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+ if (sk->sk_sndbuf < 3 * sndmem) {
+ sk->sk_sndbuf = 3 * sndmem;
+ if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
+ sk->sk_sndbuf = sysctl_tcp_wmem[2];
+ }
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -390,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
}
@@ -416,15 +425,16 @@ void tcp_initialize_rcv_mss(struct sock *sk)
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
*
* More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
* though this reference is out of date. A new paper
* is pending.
*/
@@ -724,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
* Reset our results.
*/
if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst->metrics[RTAX_RTT - 1] = 0;
+ dst_metric_set(dst, RTAX_RTT, 0);
return;
}
@@ -766,57 +776,48 @@ void tcp_update_metrics(struct sock *sk)
if (dst_metric(dst, RTAX_SSTHRESH) &&
!dst_metric_locked(dst, RTAX_SSTHRESH) &&
(tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
+ dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
if (!dst_metric_locked(dst, RTAX_CWND) &&
tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd;
+ dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
} else if (tp->snd_cwnd > tp->snd_ssthresh &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!dst_metric_locked(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] =
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
+ dst_metric_set(dst, RTAX_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
if (!dst_metric_locked(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
+ dst_metric_set(dst, RTAX_CWND,
+ (dst_metric(dst, RTAX_CWND) +
+ tp->snd_cwnd) >> 1);
} else {
/* Else slow start did not finish, cwnd is non-sense,
ssthresh may be also invalid.
*/
if (!dst_metric_locked(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
+ dst_metric_set(dst, RTAX_CWND,
+ (dst_metric(dst, RTAX_CWND) +
+ tp->snd_ssthresh) >> 1);
if (dst_metric(dst, RTAX_SSTHRESH) &&
!dst_metric_locked(dst, RTAX_SSTHRESH) &&
tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+ dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
}
if (!dst_metric_locked(dst, RTAX_REORDERING)) {
if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
tp->reordering != sysctl_tcp_reordering)
- dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+ dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
}
}
}
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- * The RFC specifies a window of no more than 4380 bytes
- * unless 2*MSS > 4380. Reading the pseudocode in the RFC
- * is a bit misleading because they use a clamp at 4380 bytes
- * rather than use a multiplier in the relevant range.
- */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
- if (!cwnd) {
- if (tp->mss_cache > 1460)
- cwnd = 2;
- else
- cwnd = (tp->mss_cache > 1095) ? 3 : 4;
- }
+ if (!cwnd)
+ cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -915,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
}
tcp_set_rto(sk);
- if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
- goto reset;
-
-cwnd:
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_time_stamp;
- return;
-
+ if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
reset:
- /* Play conservative. If timestamps are not
- * supported, TCP will fail to recalculate correct
- * rtt, if initial rto is too small. FORGET ALL AND RESET!
- */
- if (!tp->rx_opt.saw_tstamp && tp->srtt) {
- tp->srtt = 0;
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ /* Play conservative. If timestamps are not
+ * supported, TCP will fail to recalculate correct
+ * rtt, if initial rto is too small. FORGET ALL AND RESET!
+ */
+ if (!tp->rx_opt.saw_tstamp && tp->srtt) {
+ tp->srtt = 0;
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ }
}
- goto cwnd;
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
}
static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -2307,7 +2303,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+ return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
}
static inline int tcp_head_timedout(struct sock *sk)
@@ -2447,6 +2443,16 @@ static int tcp_time_to_recover(struct sock *sk)
return 1;
}
+ /* If a thin stream is detected, retransmit after first
+ * received dupack. Employ only if SACK is supported in order
+ * to avoid possible corner-case series of spurious retransmissions
+ * Use only if there are no unsent data.
+ */
+ if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+ tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+ tcp_is_sack(tp) && !tcp_send_head(sk))
+ return 1;
+
return 0;
}
@@ -2491,7 +2497,7 @@ static void tcp_timeout_skbs(struct sock *sk)
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -2503,6 +2509,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
+ /* Head already handled? */
+ if (mark_head && skb != tcp_write_queue_head(sk))
+ return;
} else {
skb = tcp_write_queue_head(sk);
cnt = 0;
@@ -2525,7 +2534,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
- if (tcp_is_sack(tp) || (oldcnt >= packets))
+ if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
@@ -2536,6 +2546,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
}
tcp_skb_mark_lost(tp, skb);
+
+ if (mark_head)
+ break;
}
tcp_verify_left_out(tp);
}
@@ -2547,17 +2560,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1);
+ tcp_mark_head_lost(sk, 1, 1);
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
- tcp_mark_head_lost(sk, lost);
+ tcp_mark_head_lost(sk, lost, 0);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto < fast_rexmit)
- sacked_upto = fast_rexmit;
- tcp_mark_head_lost(sk, sacked_upto);
+ if (sacked_upto >= 0)
+ tcp_mark_head_lost(sk, sacked_upto, 0);
+ else if (fast_rexmit)
+ tcp_mark_head_lost(sk, 1, 1);
}
tcp_timeout_skbs(sk);
@@ -2623,7 +2637,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
if (sk->sk_family == AF_INET) {
printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
msg,
- &inet->daddr, ntohs(inet->dport),
+ &inet->inet_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2633,7 +2647,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct ipv6_pinfo *np = inet6_sk(sk);
printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- &np->daddr, ntohs(inet->dport),
+ &np->daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2866,7 +2880,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
@@ -2922,6 +2936,7 @@ void tcp_simple_retransmit(struct sock *sk)
}
tcp_xmit_retransmit_queue(sk);
}
+EXPORT_SYMBOL(tcp_simple_retransmit);
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
@@ -2962,7 +2977,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+ tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
}
@@ -3270,7 +3285,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->flags & TCPCB_FLAG_SYN)) {
+ if (!(scb->flags & TCPHDR_SYN)) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
@@ -3390,8 +3405,8 @@ static void tcp_ack_probe(struct sock *sk)
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
- return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3408,9 +3423,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
- return (after(ack, tp->snd_una) ||
+ return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
@@ -3694,7 +3709,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
- dst_confirm(sk->sk_dst_cache);
+ dst_confirm(__sk_dst_get(sk));
return 1;
@@ -3829,18 +3844,20 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
/* 16-bit multiple */
opt_rx->cookie_plus = opsize;
*hvpp = ptr;
+ break;
default:
/* ignore option */
break;
- };
+ }
break;
- };
+ }
ptr += opsize-2;
length -= opsize;
}
}
}
+EXPORT_SYMBOL(tcp_parse_options);
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
{
@@ -3907,13 +3924,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
if (opsize < 2 || opsize > length)
return NULL;
if (opcode == TCPOPT_MD5SIG)
- return ptr;
+ return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
}
ptr += opsize - 2;
length -= opsize;
}
return NULL;
}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -4024,6 +4042,8 @@ static void tcp_reset(struct sock *sk)
default:
sk->sk_err = ECONNRESET;
}
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
@@ -4303,7 +4323,7 @@ static void tcp_ofo_queue(struct sock *sk)
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
- SOCK_DEBUG(sk, "ofo packet was already received \n");
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
__skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
@@ -4351,6 +4371,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
+ skb_dst_drop(skb);
__skb_pull(skb, th->doff * 4);
TCP_ECN_accept_cwr(tp, skb);
@@ -4842,7 +4863,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
return 0;
/* If we are under soft global TCP memory pressure, do not expand. */
- if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+ if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
return 0;
/* If we filled the congestion window, do not expand. */
@@ -5414,6 +5435,7 @@ discard:
__kfree_skb(skb);
return 0;
}
+EXPORT_SYMBOL(tcp_rcv_established);
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
@@ -5783,11 +5805,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
- * Fix it at least with timestamps.
+ * Force it here.
*/
- if (tp->rx_opt.saw_tstamp &&
- tp->rx_opt.rcv_tsecr && !tp->srtt)
- tcp_ack_saw_tstamp(sk, 0);
+ tcp_ack_update_rtt(sk, 0, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5819,7 +5839,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->snd_una == tp->write_seq) {
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
- dst_confirm(sk->sk_dst_cache);
+ dst_confirm(__sk_dst_get(sk));
if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */
@@ -5915,14 +5935,4 @@ discard:
}
return 0;
}
-
-EXPORT_SYMBOL(sysctl_tcp_ecn);
-EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-EXPORT_SYMBOL(tcp_parse_options);
-#ifdef CONFIG_TCP_MD5SIG
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
-#endif
-EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);