260 likes | 613 Views
CS 498 Lecture 18 TCP Implementation in Linux. Jennifer Hou Department of Computer Science University of Illinois at Urbana-Champaign Reading: Chapter 24, The Linux Networking Architecture: Design and Implementation of Network Protocols in the Linux Kernel. Flow Control.
E N D
CS 498 Lecture 18 TCP Implementation in Linux Jennifer Hou Department of Computer Science University of Illinois at Urbana-Champaign Reading: Chapter 24, The Linux Networking Architecture: Design and Implementation of Network Protocols in the Linux Kernel
tcp_select_window() • Is invoked in the tcp_transmit_skb() method when a TCP segment is sent to specify the advertised window. • Invokes tcp_receive_window() to obtain the current advertised window size. • Invokes __tcp_select_window() to obtain the available space in the computer. • Advances the receiver window and determines the advertised window size, by ensuring that the credit already granted is not taken away.
sk->data_ready send TCP tcp_ack_snd_check tcp_data _queue tcp_sendmsg Fast Path Retrans.Timer tcp_send_(delayed)_ack tcp_send_skb tcp_data tcp_data_snd_check Abschnitt 24.3 SlowPath tcp_write_timer PureACK tcp_rcv_state_process tcp_ack tcp_re -transmit_skb tcp_v4_do_rcv tcp_write_xmit __tcp_v4_lookup() tcp_transmit_skb tcp_v4_rcv tcp_rcv_established TCP_ESTABLISHED ip_input.c ip_output.c ip_local_deliver ip_queue_xmit TCP Implementation in Linux
tcp_select_window() static __inline__ u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); /* Never shrink the offered window */ if(new_win < cur_win) { new_win = cur_win; } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) tp->pred_flags = 0; return new_win; }
Window Kept at the Receiver Data received and acknowledged Data not yet acknowledged Remaining transmit credit Sequence number rcv_wup rcv_nxt rcv_wup + rcv_wnd
tcp_receive_window() static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; }
__tcp_select_window() u32 __tcp_select_window(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; int mss = tp->ack.rcv_mss; int free_space = tcp_space(sk); (…) if (free_space < mss) return 0; window = tp->rcv_wnd; if (window <= free_space - mss || window > free_space) window = (free_space/mss)*mss; return window; } If the old credit is larger than the free space, or smaller than it by at least one MSS, the window is set to the free space rounded down to the next smaller multiple of the MSS; if the free space is less than one MSS, a window of zero is returned.
Window Scaling Option • The TCP protocol header has a 16-bit window field, so a transmit credit of at most 65535 bytes can be granted. • With a transmission rate of 10 Mbps and a round-trip time of 100 ms, a connection would require a transmit window of 125000 bytes, which exceeds this limit. • The window scaling option is used to increase the value range for the advertised window size from $2^{16}$ to $2^{16} \cdot 2^{F}$, where F is the exponent specified by this option.
Window Scaling Option • Can only be sent in a SYN or SYN-ACK segment. • The maximum scaling factor is limited to $2^{14}$, i.e., the maximum window size is $2^{16} \cdot 2^{14} = 2^{30} < 2^{31}$, which prevents byte-sequence-number overflow. Option layout (1 byte each): Type: 3 | Len: 3 | Shift count
Zero-Window Probing • To prevent deadlock when the advertised window is zero and no packet can be sent (so that no update of the advertised window can be obtained from a corresponding ACK), TCP keeps an additional timer. • When the timer expires, TCP sends a zero-byte packet. static void tcp_probe_timer(struct sock *sk) { if (tp->probes_out > max_probes) { tcp_write_err(sk); } else { tcp_send_probe0(sk); } }
tcp_send_probe0() void tcp_send_probe0(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int err; err = tcp_write_wakeup(sk); if (tp->packets_out || !tp->send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; tp->backoff = 0; return; } if (err <= 0) { tp->backoff++; tp->probes_out++; tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, min(tp->rto << tp->backoff, TCP_RTO_MAX)); } else { ………..} } Annotations: tcp_write_wakeup() generates and sends a zero-window probe packet. If tp->send_head == NULL or there exist outstanding packets, there is no need to send probe packets. If a packet has been sent, the probe timer is reset to $\min(\texttt{tp->rto} \cdot 2^{\texttt{tp->backoff}}, \texttt{TCP\_RTO\_MAX})$.
tcp_write_wakeup() int tcp_write_wakeup(struct sock *sk) { (…) if ((skb = tp->send_head) != NULL && before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); (…) return err; } else { return tcp_xmit_probe_skb(sk, 0); } }
tcp_xmit_probe_skb() static int tcp_xmit_probe_skb(struct sock *sk, int urgent) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (skb == NULL) return -1; /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb); }
tcp_ack_probe() • tcp_ack_probe(sk,ack) is invoked in tcp_ack() when a TCP segment with the ACK flag is received and the segment is a zero-window probe segment. static void tcp_ack_probe(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Was it a usable window open? */ if (!after(TCP_SKB_CB(tp->send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { tp->backoff = 0; tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); } else { tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, min(tp->rto << tp->backoff, TCP_RTO_MAX)); } }
tcp_v4_init_sock() static int tcp_v4_init_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); tp->rto = TCP_TIMEOUT_INIT; (…) tp->snd_cwnd = 2; tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; (…) sk->state = TCP_CLOSE; (…) return 0; }
reno_cong_avoid() • tcp_cong_avoid(tp) implements the slow-start and congestion-avoidance algorithm. • Is invoked when an incoming TCP segment with valid acknowledgement is handled in tcp_ack(). • Implements the congestion avoidance phase with the use of tp->snd_cwnd_cnt.
reno_cong_avoid() static __inline__ void reno_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; } else { if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt=0; } else tp->snd_cwnd_cnt++; } tp->snd_cwnd_stamp = tcp_time_stamp; }
tcp_enter_loss() • tcp_enter_loss(sk,how) is invoked in the handling routine of tcp_retransmit_timer. void tcp_enter_loss(struct sock *sk, int how) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff *skb; int cnt = 0; /* Reduce ssthresh if it has not yet been made inside this window. */ if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tcp_recalc_ssthresh(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; (…)
tcp_recalc_ssthresh() static inline __u32 tcp_recalc_ssthresh(struct tcp_opt *tp) { return max(tp->snd_cwnd >> 1, 2); }
Fast Retransmit and Fast Recovery • When three acknowledgement duplicates are received • the variable tp->snd_ssthresh is set to ½ current window size. • The missing segment is retransmitted. • tp->snd_cwnd takes the value tp->snd_ssthresh + 3 × MSS. • Each time a duplicate ack is received, the congestion window tp->snd_cwnd increases by MSS, and an additional segment is sent. • When the first ack of new data arrives, tp->snd_cwnd takes the original value of tp->snd_ssthresh (stored in tp->prior_ssthresh).
Nagle Algorithm • Nagle algorithm (small-packet-avoidance algorithm) aims to avoid excessive network load due to a large number of small TCP packets. tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int nonagle) { return (skb->len < mss_now && !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && (nonagle == 2 || (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } static __inline__ int tcp_minshall_check(struct tcp_opt *tp) { return after(tp->snd_sml,tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } Only complete packets will be sent