TCP滑动窗口的功能是实现流量控制。数据接收方只接收seq落入窗口范围内的数据;发送方也不会发送窗口之外的数据,一旦发现窗口太小则会停止发送直到窗口变大,这样TCP数据接收方就能通过窗口通告来控制数据发送方发送数据的速度。窗口的值存储在TCP报文段的window字段中,大小为16bit,即不使用窗口扩大选项时窗口的最大值是65535。如果使用窗口扩大选项(后续讨论),则实际通告窗口的值为window字段的值左移“窗口扩大因子”位后的结果。
关于滑动窗口其实只有三个问题需要明晰:
(1)窗口(包括初始窗口)如何生成(即生产)
(2)窗口信息如何更新(即维护)
(3)窗口如何被使用(即消费)
下面一一解答。
6.2.1 窗口生产
初始窗口的设置在发送SYN和SYN|ACK时进行:
2752 void tcp_connect_init(struct sock *sk) 2753 { 2754 const struct dst_entry *dst = __sk_dst_get(sk); 2755 struct tcp_sock *tp = tcp_sk(sk); 2756 __u8 rcv_wscale; ... 2789 tcp_select_initial_window(tcp_full_space(sk), 2790 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), //MSS - 时间戳选项大小 2791 &tp->rcv_wnd, 2792 &tp->window_clamp, 2793 sysctl_tcp_window_scaling, 2794 &rcv_wscale, 2795 dst_metric(dst, RTAX_INITRWND)); 2796 2797 tp->rx_opt.rcv_wscale = rcv_wscale; 2798 tp->rcv_ssthresh = tp->rcv_wnd; ...构建SYN|ACK时:
2654 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2655 struct request_sock *req, 2656 struct tcp_fastopen_cookie *foc) 2657 { ... 2692 /* tcp_full_space because it is guaranteed to be the first packet */ 2693 tcp_select_initial_window(tcp_full_space(sk), 2694 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), //MSS - 时间戳选项大小 2695 &req->rcv_wnd, 2696 &req->window_clamp, 2697 ireq->wscale_ok, 2698 &rcv_wscale, 2699 dst_metric(dst, RTAX_INITRWND)); 2700 ireq->rcv_wscale = rcv_wscale; ...这样看来 计算初始窗口大小的功能是由tcp_select_initial_window函数完成的:
191 void tcp_select_initial_window(int __space, __u32 mss, 192 __u32 *rcv_wnd, __u32 *window_clamp, 193 int wscale_ok, __u8 *rcv_wscale, 194 __u32 init_rcv_wnd) 195 { 196 unsigned int space = (__space < 0 ? 0 : __space); //__space是TCP根据接收缓存的大小计算出来的 197 198 /* If no clamp set the clamp to the max possible scaled window */ 199 if (*window_clamp == 0) //*window_clamp的值是通告给对端的最大的窗口值 200 (*window_clamp) = (65535 << 14); //窗口扩大因子最大为14 201 space = min(*window_clamp, space); 202 203 /* Quantize space offering to a multiple of mss if possible. */ 204 if (space > mss) 205 space = (space / mss) * mss; //将space整理为mss的整数倍 ... 215 if (sysctl_tcp_workaround_signed_windows) //使用有符号的接收窗口 216 (*rcv_wnd) = min(space, MAX_TCP_WINDOW); //设置通告窗口小于32767,否则可能会导致一些不稳定的TCP协议实现的崩溃 217 else 218 (*rcv_wnd) = space; //将通告窗口的值设置为space 219 220 (*rcv_wscale) = 0; 221 if (wscale_ok) {//开启窗口扩大选项 222 /* Set window scaling on max possible window 223 * See RFC1323 for an explanation of the limit to 14 224 */ 225 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); //接收缓存最大空间 226 space = min_t(u32, space, *window_clamp); 227 while (space > 65535 && (*rcv_wscale) < 14) { //计算窗口扩大因子 228 space >>= 1; 229 (*rcv_wscale)++; 230 } 231 } 232 233 /* Set initial window to a value enough for senders starting with 234 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place 235 * a limit on the initial window when mss is larger than 1460. 
236 */ 237 if (mss > (1 << *rcv_wscale)) { 238 int init_cwnd = TCP_DEFAULT_INIT_RCVWND; 239 if (mss > 1460) 240 init_cwnd = 241 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); 242 /* when initializing use the value from init_rcv_wnd 243 * rather than the default from above 244 */ 245 if (init_rcv_wnd) 246 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); 247 else 248 *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); 249 } 250 251 /* Set the clamp no higher than max representable value */ 252 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); 253 }tcp_full_space函数返回可用空间的大小:
1056 static inline int tcp_win_from_space(int space) 1057 { 1058 return sysctl_tcp_adv_win_scale<=0 ? 1059 (space>>(-sysctl_tcp_adv_win_scale)) : 1060 space - (space>>sysctl_tcp_adv_win_scale); 1061 } ... 1070 static inline int tcp_full_space(const struct sock *sk) 1071 { 1072 return tcp_win_from_space(sk->sk_rcvbuf); 1073 }可见,tcp_full_space函数在sysctl_tcp_adv_win_scale不为0的情况下会返回全部接收缓存空间的一部分的大小。
tcp_transmit_skb函数在构建TCP首部字段时会根据包中是否有SYN标记位做区别处理:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, 829 gfp_t gfp_mask) 830 { 831 const struct inet_connection_sock *icsk = inet_csk(sk); 832 struct inet_sock *inet; 833 struct tcp_sock *tp; 834 struct tcp_skb_cb *tcb; 835 struct tcp_out_options opts; 836 unsigned int tcp_options_size, tcp_header_size; 837 struct tcp_md5sig_key *md5; 838 struct tcphdr *th; 839 int err; ... 903 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { 904 /* RFC1323: The window in SYN & SYN/ACK segments 905 * is never scaled. 906 */ 907 th->window = htons(min(tp->rcv_wnd, 65535U)); 908 } else { 909 th->window = htons(tcp_select_window(sk)); 910 } ...对于SYN包窗口会取tcp_select_initial_window设置的值(没有超出窗口最大值的情况下);
连接建立完成后发送的报文的窗口会选取tcp_select_window计算的值:
261 static u16 tcp_select_window(struct sock *sk) 262 { 263 struct tcp_sock *tp = tcp_sk(sk); 264 u32 cur_win = tcp_receive_window(tp);//tp->rcv_wnd减去已经接收的数据长度为当前窗口大小 265 u32 new_win = __tcp_select_window(sk);//获得新的窗口大小 266 267 /* Never shrink the offered window */ 268 if (new_win < cur_win) {//出现窗口左移的可能 269 /* Danger Will Robinson! 270 * Don't update rcv_wup/rcv_wnd here or else 271 * we will not be able to advertise a zero 272 * window in time. --DaveM 273 * 274 * Relax Will Robinson. 275 */ 276 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);//将cur_win以1 << tp->rx_opt.rcv_wscale的倍数对齐的结果作为新的窗口 277 } 278 tp->rcv_wnd = new_win; 279 tp->rcv_wup = tp->rcv_nxt; 280 281 /* Make sure we do not exceed the maximum possible 282 * scaled window. 283 */ 284 if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) 285 new_win = min(new_win, MAX_TCP_WINDOW); //有符号窗口不能超过32767 286 else 287 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); //窗口不能超过最大值 288 289 /* RFC1323 scaling applied */ 290 new_win >>= tp->rx_opt.rcv_wscale; //使用窗口扩大因子 291 292 /* If we advertise zero window, disable fast path. */ 293 if (new_win == 0) 294 tp->pred_flags = 0; 295 296 return new_win; 297 }__tcp_select_window函数会根据接收缓存的信息计算新的窗口:
2111 u32 __tcp_select_window(struct sock *sk) 2112 { 2113 struct inet_connection_sock *icsk = inet_csk(sk); 2114 struct tcp_sock *tp = tcp_sk(sk); 2115 /* MSS for the peer's data. Previous versions used mss_clamp 2116 * here. I don't know if the value based on our guesses 2117 * of peer's MSS is better for the performance. It's more correct 2118 * but may be worse for the performance because of rcv_mss 2119 * fluctuations. --SAW 1998/11/1 2120 */ 2121 int mss = icsk->icsk_ack.rcv_mss; 2122 int free_space = tcp_space(sk); //得到可用缓存空间大小 2123 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); 2124 int window; 2125 2126 if (mss > full_space) 2127 mss = full_space; 2128 2129 if (free_space < (full_space >> 1)) { //接收缓存空余空间小于接收缓存总空间的一半 2130 icsk->icsk_ack.quick = 0; 2131 2132 if (sk_under_memory_pressure(sk)) //内核内存紧张 2133 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 2134 4U * tp->advmss); //减小当前最大窗口值 2135 2136 if (free_space < mss) //可用缓存太少,则返回0窗口 2137 return 0; 2138 } 2139 2140 if (free_space > tp->rcv_ssthresh) 2141 free_space = tp->rcv_ssthresh; 2142 2143 /* Don't do rounding if we are using window scaling, since the 2144 * scaled window will not line up with the MSS boundary anyway. 2145 */ 2146 window = tp->rcv_wnd; 2147 if (tp->rx_opt.rcv_wscale) {//开启了窗口扩大选项 2148 window = free_space; //直接使用可用缓存大小 2149 2150 /* Advertise enough space so that it won't get scaled away. 2151 * Import case: prevent zero window announcement if 2152 * 1<<rcv_wscale > mss. 2153 */ 2154 if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) 2155 window = (((window >> tp->rx_opt.rcv_wscale) + 1) //+ 1是为了保证当 1<<rcv_wscale > mss时对端至少可以发送一个报文段 2156 << tp->rx_opt.rcv_wscale); 2157 } else { 2158 /* Get the largest window that is a nice multiple of mss. 2159 * Window clamp already applied above. 2160 * If our current window offering is within 1 mss of the 2161 * free space we just keep it. 
This prevents the divide 2162 * and multiply from happening most of the time. 2163 * We also don't do any window rounding when the free space 2164 * is too small. 2165 */ 2166 if (window <= free_space - mss || window > free_space) //旧的通告窗口过大或过小 2167 window = (free_space / mss) * mss; //调整窗口大小为对端MSS的整数倍 2168 else if (mss == full_space && //可用空间仅仅为一个MSS 2169 free_space > window + (full_space >> 1)) //旧的通告窗口远小于可用空间大小 2170 window = free_space; 2171 } 2172 2173 return window; 2174 }综上可知,TCP数据接收端通告给对端的窗口是接收缓存大小的一种反映。
6.2.2 窗口维护
接收到SYN|ACK和三次握手的ACK时TCP还会调用tcp_init_buffer_space更新一下窗口信息。收到SYN|ACK时:
5373 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5374 const struct tcphdr *th, unsigned int len) 5375 { ... 5480 tcp_finish_connect(sk, skb); ...
5291 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) 5292 { ... 5315 tcp_init_buffer_space(sk) ...收到ACK时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5601 const struct tcphdr *th, unsigned int len) 5602 { ... 5682 case TCP_SYN_RECV: ... 5700 tcp_init_buffer_space(sk); ...tcp_init_buffer_space函数用于更新window_clamp信息:
376 void tcp_init_buffer_space(struct sock *sk) 377 { 378 struct tcp_sock *tp = tcp_sk(sk); 379 int maxwin; 380 381 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) 382 tcp_fixup_rcvbuf(sk); 383 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) 384 tcp_fixup_sndbuf(sk); 385 386 tp->rcvq_space.space = tp->rcv_wnd; 387 388 maxwin = tcp_full_space(sk); 389 390 if (tp->window_clamp >= maxwin) { 391 tp->window_clamp = maxwin; 392 393 if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) 394 tp->window_clamp = max(maxwin - 395 (maxwin >> sysctl_tcp_app_win), 396 4 * tp->advmss); 397 } 398 399 /* Force reservation of one segment. */ 400 if (sysctl_tcp_app_win && 401 tp->window_clamp > 2 * tp->advmss && 402 tp->window_clamp + tp->advmss > maxwin) 403 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); 404 405 tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); 406 tp->snd_cwnd_stamp = tcp_time_stamp; 407 }在“建立”状态下接收窗口的更新情况有:
(1)收到数据调用tcp_event_data_recv函数时:
584 static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) 585 { ... 626 if (skb->len >= 128) 627 tcp_grow_window(sk, skb); 628 }tcp_grow_window函数用于更新当前最大通告窗口值tp->rcv_ssthresh:
305 static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) 306 { 307 struct tcp_sock *tp = tcp_sk(sk); 308 /* Optimize this! */ 309 int truesize = tcp_win_from_space(skb->truesize) >> 1; 310 int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; //允许增加的最大window大小 311 312 while (tp->rcv_ssthresh <= window) { 313 if (truesize <= skb->len) 314 return 2 * inet_csk(sk)->icsk_ack.rcv_mss; 315 316 truesize >>= 1; 317 window >>= 1; 318 } 319 return 0; //不增大rcv_ssthresh 320 } 321 322 static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) 323 { 324 struct tcp_sock *tp = tcp_sk(sk); 325 326 /* Check #1 */ 327 if (tp->rcv_ssthresh < tp->window_clamp && //rcv_ssthresh小于上限 328 (int)tp->rcv_ssthresh < tcp_space(sk) && //rcv_ssthresh小于可用空间大小 329 !sk_under_memory_pressure(sk)) { //内存不紧张 330 int incr; 331 //可以增大 332 /* Check #2. Increase window, if skb with such overhead 333 * will fit to rcvbuf in future. 334 */ 335 if (tcp_win_from_space(skb->truesize) <= skb->len) //如果只增加skb->truesize大小的空间,但其映射出来的窗口值小于skb中数据的长度 336 incr = 2 * tp->advmss; //增加2个MSS大小 337 else 338 incr = __tcp_grow_window(sk, skb); 339 340 if (incr) { 341 incr = max_t(int, incr, 2 * skb->len); 342 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, 343 tp->window_clamp); //增大rcv_ssthresh 344 inet_csk(sk)->icsk_ack.quick |= 1; //允许快速回复ACK 345 } 346 } 347 }(2)TCP调用tcp_try_rmem_schedule整理接收缓存时:
4061 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 4062 unsigned int size) 4063 { 4064 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || //接收缓存分配超量 4065 !sk_rmem_schedule(sk, skb, size)) { //全局缓存或接收缓存空间达到上限 4066 4067 if (tcp_prune_queue(sk) < 0) //释放一部分缓存,必要时会清空乱序队列 ...tcp_prune_queue函数:
4623 static int tcp_prune_queue(struct sock *sk) 4624 { ... 4631 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) //接收缓存紧张 4632 tcp_clamp_window(sk); 4633 else if (sk_under_memory_pressure(sk)) //内核内存紧张 4634 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); //缩小rcv_ssthreshtcp_clamp_window函数会更新rcv_ssthresh:
410 static void tcp_clamp_window(struct sock *sk) 411 { 412 struct tcp_sock *tp = tcp_sk(sk); 413 struct inet_connection_sock *icsk = inet_csk(sk); 414 415 icsk->icsk_ack.quick = 0; 416 417 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 418 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 419 !sk_under_memory_pressure(sk) && 420 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { 421 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 422 sysctl_tcp_rmem[2]); 423 } 424 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) 425 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); 426 }
(3)应用进程通过系统调用copy完数据后,调用tcp_rcv_space_adjust调整接收缓存空间:
522 void tcp_rcv_space_adjust(struct sock *sk) 523 { ... 535 space = 2 * (tp->copied_seq - tp->rcvq_space.seq); 536 537 space = max(tp->rcvq_space.space, space); 538 539 if (tp->rcvq_space.space != space) { ... 544 if (sysctl_tcp_moderate_rcvbuf && 545 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 546 int new_clamp = space; 547 548 /* Receive space grows, normalize in order to 549 * take into account packet headers and sk_buff 550 * structure overhead. 551 */ 552 space /= tp->advmss; //space转换为最大报文段个数 553 if (!space) 554 space = 1; 555 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); 556 while (tcp_win_from_space(rcvmem) < tp->advmss) 557 rcvmem += 128; 558 space *= rcvmem; 559 space = min(space, sysctl_tcp_rmem[2]); 560 if (space > sk->sk_rcvbuf) { 561 sk->sk_rcvbuf = space; 562 563 /* Make the window clamp follow along. */ 564 tp->window_clamp = new_clamp; //更新最大通告窗口值 ...
数据接收方发送的通告窗口被数据发送方接收到后,就称为发送窗口。对于发送窗口,发包方收到ACK后会调用tcp_ack函数进行处理。tcp_ack函数在慢速处理路径会调用tcp_ack_update_window函数更新发送窗口(在快速处理路径下通告窗口不变,发送窗口也就不变):
3218 static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, 3219 u32 ack_seq) 3220 { 3221 struct tcp_sock *tp = tcp_sk(sk); 3222 int flag = 0; 3223 u32 nwin = ntohs(tcp_hdr(skb)->window); //得到通告窗口 3224 3225 if (likely(!tcp_hdr(skb)->syn)) //非SYN包 3226 nwin <<= tp->rx_opt.snd_wscale; //得到真正的窗口大小 3227 3228 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { //可以进行窗口更新 3229 flag |= FLAG_WIN_UPDATE; 3230 tcp_update_wl(tp, ack_seq); //记录更新窗口时的ack_seq 3231 3232 if (tp->snd_wnd != nwin) { //通告窗口变化 3233 tp->snd_wnd = nwin; //更新发送窗口 3234 3235 /* Note, it is the only place, where 3236 * fast path is recovered for sending TCP. 3237 */ 3238 tp->pred_flags = 0; 3239 tcp_fast_path_check(sk); //试着开启快速处理路径 3240 3241 if (nwin > tp->max_window) { 3242 tp->max_window = nwin; //更新最大发送窗口信息 3243 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); //更新MSS信息 3244 } 3245 } 3246 } 3247 3248 tp->snd_una = ack; 3249 3250 return flag; 3251 }tcp_may_update_window函数判断是否允许更新窗口:
3204 static inline bool tcp_may_update_window(const struct tcp_sock *tp, 3205 const u32 ack, const u32 ack_seq, 3206 const u32 nwin) 3207 { 3208 return after(ack, tp->snd_una) || //确认了一部分数据 3209 after(ack_seq, tp->snd_wl1) || //从上次更新窗口到现在对端收到了新的数据 3210 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); //报文的seq和ack_seq都没有变化,但窗口变大了,即为窗口更新报文 3211 }3208-3209:有更多的新数据被确认时通常对端的接收缓存大小会改变,从而通告窗口会变化。即使窗口不变,旧数据被确认后窗口的右边缘右移,从而可以发送更多数据。
6.2.3 窗口消费
窗口的值对于数据收发双方有着不同的意义:数据发送方利用窗口判断数据是否可以发送;接收方使用窗口来过滤接收到的数据。先看发送方:
1811 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 1812 int push_one, gfp_t gfp) 1813 { 1814 struct tcp_sock *tp = tcp_sk(sk); 1815 struct sk_buff *skb; 1816 unsigned int tso_segs, sent_pkts; 1817 int cwnd_quota; 1818 int result; ... 1832 while ((skb = tcp_send_head(sk))) { 1833 unsigned int limit; ... 1851 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) //检查发送窗口是否允许发送数据 1852 break; //不允许则中止发送tcp_snd_wnd_test函数检查发送窗口:
1490 static bool tcp_snd_wnd_test(const struct tcp_sock *tp, 1491 const struct sk_buff *skb, 1492 unsigned int cur_mss) 1493 { 1494 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 1495 1496 if (skb->len > cur_mss) 1497 end_seq = TCP_SKB_CB(skb)->seq + cur_mss; 1498 1499 return !after(end_seq, tcp_wnd_end(tp));//tcp_wnd_end(tp)为tp->snd_una + tp->snd_wnd,即发送窗口允许发送的最高序列号 1500 }只有当end_seq <= tp->snd_una + tp->snd_wnd时,待发送的数据才能全部落入窗口之内。
再来看数据接收端,慢速路径中在检查报文seq合法性时会调用tcp_sequence来检查数据是否在窗口之内:
3738 static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) 3739 { 3740 return !before(end_seq, tp->rcv_wup) && 3741 !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); 3742 }3740:如果before(end_seq, rcv_wup)为真,则意味着全部数据段都在窗口左边,即全部为旧数据
3741:如果after(seq, rcv_nxt + rcv_win)为真,则说明全部数据段位于窗口右边,即超出窗口。
tcp_receive_window的功能是计算接收窗口(即对端的发送窗口):
651 static inline u32 tcp_receive_window(const struct tcp_sock *tp) 652 { 653 s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; 654 655 if (win < 0) 656 win = 0; 657 return (u32) win; 658 }653:rcv_wup为最后一次发送窗口通告时的rcv_nxt,rcv_wnd为最后一次发送窗口通告时的窗口值,rcv_nxt - rcv_wup为最后一次发送窗口通告到现在所收到的数据长度(不包括乱序数据),当前接收窗口的大小为:最后一次通告的窗口 - 当前已经接收的数据长度 = rcv_wnd - (rcv_nxt - rcv_wup) = rcv_wup + rcv_wnd - rcv_nxt。
tcp_data_queue中会进行更严格的检查:
4300 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4301 { ... 4321 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 4322 if (tcp_receive_window(tp) == 0) //0窗口,不能接收数据 4323 goto out_of_window; ... 4385 out_of_window: 4386 tcp_enter_quickack_mode(sk); 4387 inet_csk_schedule_ack(sk); 4388 drop: 4389 __kfree_skb(skb); 4390 return; 4391 } ... 4392 4393 /* Out of window. F.e. zero window probe. */ 4394 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) //数据全部在窗口之外 4395 goto out_of_window; .. 4399 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 4400 /* Partial packet, seq < rcv_next < end_seq */ 4401 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", 4402 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, 4403 TCP_SKB_CB(skb)->end_seq); 4404 4405 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); 4406 4407 /* If window is closed, drop tail of packet. But after 4408 * remembering D-SACK for its head made in previous line. 4409 */ 4410 if (!tcp_receive_window(tp)) //0窗口,乱序数据也不能收 4411 goto out_of_window; ...4394-4395:能到达tcp_data_queue的包必然通过了tcp_sequence的检查,即seq <= rcv_nxt + rcv_win成立。但如果seq == rcv_nxt + rcv_win为真,则在此处也会被判断为“out of window”,被丢弃,也就是说tcp_data_queue的检查比tcp_sequence多了一个“相等”的情况。为什么不在tcp_sequence中做这个严格的检查呢?我认为对于seq == rcv_nxt + rcv_win为真的包,其ack_seq可能是合法的,tcp_ack函数需要用这个包头中的ack_seq和窗口等信息;由于tcp_data_queue只对数据感兴趣,这种“没有一字节数据在窗口内”的包自然就没有用了。
对TCP滑动窗口机制的探究就到这里了,我们还会在后续的旅程中再次看到它的身影。