tcp: switch TCP TS option (RFC 7323) to 1ms clock TCP Timestamps option is defined in RFC 7323 Traditionally on linux, it has been tied to the internal 'jiffies' variable, because it had been a cheap and good enough generator. For TCP flows on the Internet, 1 ms resolution would be much better than 4ms or 10ms (HZ=250 or HZ=100 respectively) For TCP flows in the DC, Google has used usec resolution for more than two years with great success [1] Receive size autotuning (DRS) is indeed more precise and converges faster to optimal window size. This patch converts tp->tcp_mstamp to a plain u64 value storing a 1 usec TCP clock. This choice will allow us to upstream the 1 usec TS option as discussed in IETF 97. [1] https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 9a568de4818dea9a05af141046bd3e589245ab83 [log] [tgz]
author: Eric Dumazet <edumazet@google.com> Tue May 16 14:00:14 2017 -0700
committer: David S. Miller <davem@davemloft.net> Wed May 17 16:06:01 2017 -0400
tree: 6f1502edf55ecb7205660d62bd683ebcf912cfea
parent: ac9517fcf310327fa3e3b0d8366e4b11236b1b4b [diff] [blame]
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 10e6775..9a5a9e8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c

@@ -441,7 +441,7 @@ void tcp_init_buffer_space(struct sock *sk)
 		tcp_sndbuf_expand(sk);
 
 	tp->rcvq_space.space = tp->rcv_wnd;
-	skb_mstamp_get(&tp->tcp_mstamp);
+	tcp_mstamp_refresh(tp);
 	tp->rcvq_space.time = tp->tcp_mstamp;
 	tp->rcvq_space.seq = tp->copied_seq;
 
@@ -555,11 +555,11 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 {
 	u32 delta_us;
 
-	if (tp->rcv_rtt_est.time.v64 == 0)
+	if (tp->rcv_rtt_est.time == 0)
 		goto new_measure;
 	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
 		return;
-	delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time);
+	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
 	tcp_rcv_rtt_update(tp, delta_us, 1);
 
 new_measure:
@@ -571,13 +571,15 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 					  const struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (tp->rx_opt.rcv_tsecr &&
 	    (TCP_SKB_CB(skb)->end_seq -
-	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
-		tcp_rcv_rtt_update(tp,
-				   jiffies_to_usecs(tcp_time_stamp -
-						    tp->rx_opt.rcv_tsecr),
-				   0);
+	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
+		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+		u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+
+		tcp_rcv_rtt_update(tp, delta_us, 0);
+	}
 }
 
 /*
@@ -590,7 +592,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	int time;
 	int copied;
 
-	time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time);
+	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
 	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
 		return;
 
@@ -1134,8 +1136,8 @@ struct tcp_sacktag_state {
 	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
 	 * but congestion control should still get an accurate delay signal.
 	 */
-	struct skb_mstamp first_sackt;
-	struct skb_mstamp last_sackt;
+	u64	first_sackt;
+	u64	last_sackt;
 	struct rate_sample *rate;
 	int	flag;
 };
@@ -1200,7 +1202,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
 			  u32 start_seq, u32 end_seq,
 			  int dup_sack, int pcount,
-			  const struct skb_mstamp *xmit_time)
+			  u64 xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fack_count = state->fack_count;
@@ -1242,9 +1244,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
 							   state->reord);
 				if (!after(end_seq, tp->high_seq))
 					state->flag |= FLAG_ORIG_SACK_ACKED;
-				if (state->first_sackt.v64 == 0)
-					state->first_sackt = *xmit_time;
-				state->last_sackt = *xmit_time;
+				if (state->first_sackt == 0)
+					state->first_sackt = xmit_time;
+				state->last_sackt = xmit_time;
 			}
 
 			if (sacked & TCPCB_LOST) {
@@ -1304,7 +1306,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
 			start_seq, end_seq, dup_sack, pcount,
-			&skb->skb_mstamp);
+			skb->skb_mstamp);
 	tcp_rate_skb_delivered(sk, skb, state->rate);
 
 	if (skb == tp->lost_skb_hint)
@@ -1356,8 +1358,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		tcp_advance_highest_sack(sk, skb);
 
 	tcp_skb_collapse_tstamp(prev, skb);
-	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
-		TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
+		TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
 
 	tcp_unlink_write_queue(skb, sk);
 	sk_wmem_free_skb(sk, skb);
@@ -1587,7 +1589,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
 						tcp_skb_pcount(skb),
-						&skb->skb_mstamp);
+						skb->skb_mstamp);
 			tcp_rate_skb_delivered(sk, skb, state->rate);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
@@ -2936,9 +2938,12 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
-	    flag & FLAG_ACKED)
-		seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
-							  tp->rx_opt.rcv_tsecr);
+	    flag & FLAG_ACKED) {
+		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+		u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+
+		seq_rtt_us = ca_rtt_us = delta_us;
+	}
 	if (seq_rtt_us < 0)
 		return false;
 
@@ -2960,12 +2965,8 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
 {
 	long rtt_us = -1L;
 
-	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
-		struct skb_mstamp now;
-
-		skb_mstamp_get(&now);
-		rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
-	}
+	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
+		rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
 
 	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
 }
@@ -3003,7 +3004,7 @@ void tcp_rearm_rto(struct sock *sk)
 			struct sk_buff *skb = tcp_write_queue_head(sk);
 			const u32 rto_time_stamp =
 				tcp_skb_timestamp(skb) + rto;
-			s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+			s32 delta = (s32)(rto_time_stamp - tcp_jiffies32);
 			/* delta may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
@@ -3060,9 +3061,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			       struct tcp_sacktag_state *sack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct skb_mstamp first_ackt, last_ackt;
+	u64 first_ackt, last_ackt;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct skb_mstamp *now = &tp->tcp_mstamp;
 	u32 prior_sacked = tp->sacked_out;
 	u32 reord = tp->packets_out;
 	bool fully_acked = true;
@@ -3075,7 +3075,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	bool rtt_update;
 	int flag = 0;
 
-	first_ackt.v64 = 0;
+	first_ackt = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -3106,8 +3106,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			flag |= FLAG_RETRANS_DATA_ACKED;
 		} else if (!(sacked & TCPCB_SACKED_ACKED)) {
 			last_ackt = skb->skb_mstamp;
-			WARN_ON_ONCE(last_ackt.v64 == 0);
-			if (!first_ackt.v64)
+			WARN_ON_ONCE(last_ackt == 0);
+			if (!first_ackt)
 				first_ackt = last_ackt;
 
 			last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
@@ -3122,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tp->delivered += acked_pcount;
 			if (!tcp_skb_spurious_retrans(tp, skb))
 				tcp_rack_advance(tp, sacked, scb->end_seq,
-						 &skb->skb_mstamp);
+						 skb->skb_mstamp);
 		}
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
@@ -3165,13 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
-	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-		seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
-		ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
+	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
+		seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
+		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
 	}
-	if (sack->first_sackt.v64) {
-		sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
-		ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
+	if (sack->first_sackt) {
+		sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
+		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
 	}
 	sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
 	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
@@ -3201,7 +3201,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
-		   sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
+		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
 		/* Do not re-arm RTO if the sack RTT is measured from data sent
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
@@ -3553,7 +3553,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int acked = 0; /* Number of packets newly acked */
 	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 
-	sack_state.first_sackt.v64 = 0;
+	sack_state.first_sackt = 0;
 	sack_state.rate = &rs;
 
 	/* We very likely will need to access write queue head. */
@@ -5356,7 +5356,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	skb_mstamp_get(&tp->tcp_mstamp);
+	tcp_mstamp_refresh(tp);
 	if (unlikely(!sk->sk_rx_dst))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
 	/*
@@ -5672,7 +5672,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
-			     tcp_time_stamp)) {
+			     tcp_time_stamp(tp))) {
 			NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_PAWSACTIVEREJECTED);
 			goto reset_and_undo;
@@ -5917,7 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 	case TCP_SYN_SENT:
 		tp->rx_opt.saw_tstamp = 0;
-		skb_mstamp_get(&tp->tcp_mstamp);
+		tcp_mstamp_refresh(tp);
 		queued = tcp_rcv_synsent_state_process(sk, skb, th);
 		if (queued >= 0)
 			return queued;
@@ -5929,7 +5929,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		return 0;
 	}
 
-	skb_mstamp_get(&tp->tcp_mstamp);
+	tcp_mstamp_refresh(tp);
 	tp->rx_opt.saw_tstamp = 0;
 	req = tp->fastopen_rsk;
 	if (req) {
@@ -6202,7 +6202,7 @@ static void tcp_openreq_init(struct request_sock *req,
 	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-	skb_mstamp_get(&tcp_rsk(req)->snt_synack);
+	tcp_rsk(req)->snt_synack = tcp_clock_us();
 	tcp_rsk(req)->last_oow_ack_time = 0;
 	req->mss = rx_opt->mss_clamp;
 	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
commit	9a568de4818dea9a05af141046bd3e589245ab83	[log] [tgz]
author	Eric Dumazet <edumazet@google.com>	Tue May 16 14:00:14 2017 -0700
committer	David S. Miller <davem@davemloft.net>	Wed May 17 16:06:01 2017 -0400
tree	6f1502edf55ecb7205660d62bd683ebcf912cfea
parent	ac9517fcf310327fa3e3b0d8366e4b11236b1b4b [diff] [blame]