After the client's SYN reaches the server's NIC, it enters the operating system's network protocol stack and, after link-layer and network-layer processing, arrives at TCP's entry function: tcp_v4_rcv for TCPv4 and tcp_v6_rcv for TCPv6. Let us walk through tcp_v4_rcv:
1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
1963 const struct iphdr *iph;
1964 const struct tcphdr *th;
1965 struct sock *sk;
1966 int ret;
...
1975 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1976 goto discard_it;
1977
1978 th = tcp_hdr(skb);
1979
1980 if (th->doff < sizeof(struct tcphdr) / 4) //check that the TCP header is at least the minimum length
1981 goto bad_packet;
1982 if (!pskb_may_pull(skb, th->doff * 4))
1983 goto discard_it;
1984
1985 /* An explanation is required here, I think.
1986 * Packet length and doff are validated by header prediction,
1987 * provided case of th->doff==0 is eliminated.
1988 * So, we defer the checks. */
1989 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) //verify the checksum
1990 goto csum_error;
1991
1992 th = tcp_hdr(skb);
1993 iph = ip_hdr(skb);
1994 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1995 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1996 skb->len - th->doff * 4);
1997 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1998 TCP_SKB_CB(skb)->when = 0;
1999 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2000 TCP_SKB_CB(skb)->sacked = 0;
2001
2002 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2003 if (!sk)
2004 goto no_tcp_socket;
...
2023
2024 bh_lock_sock_nested(sk); //take the spinlock so that other softirqs cannot access this socket concurrently
2025 ret = 0;
2026 if (!sock_owned_by_user(sk)) {//if no process has this socket locked; for a SYN this condition is true
...
2035 {
2036 if (!tcp_prequeue(sk, skb)) //try to put the skb on the prequeue; if no process is waiting to use the prequeue, process it via tcp_v4_do_rcv
2037 ret = tcp_v4_do_rcv(sk, skb);//enter the main processing function
2038 }
...
2043 goto discard_and_relse;
2044 }
2045 bh_unlock_sock(sk);
2046
2047 sock_put(sk);
2048
2049 return ret;
2050
Code analysis:
1975: pskb_may_pull checks whether the skb's linear buffer holds at least len bytes that can be pulled; if not, it reallocates the skb and copies the data held in frags into contiguous space. A SYN packet carries no data in frags.
1994-2000: record some fields of the segment in the skb's cb area for later use (see the sketch after this list);
2002: __inet_lookup_skb looks up the connection in tcp_hashinfo using the segment's source/destination IP and ports. It searches ehash, the established-connection table, first; for a SYN this lookup fails. It then searches listening_hash, where it finds the socket that the server added to that table during the listen system call.
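Since a SYN, like a FIN, consumes one sequence number even though it carries no payload, the end_seq computed at lines 1995-1996 for a bare SYN works out to seq + 1; this is the same value that later becomes rcv_nxt and, ultimately, the acknowledgment number of the SYN|ACK. A minimal user-space sketch of that arithmetic (the function and all names are ours, for illustration only):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only, not kernel code: mirrors the end_seq computation
 * at lines 1995-1996. SYN and FIN each occupy one sequence number, so
 * end_seq is the first sequence number after this segment. */
static uint32_t tcp_end_seq(uint32_t seq, unsigned syn, unsigned fin,
                            unsigned skb_len, unsigned doff)
{
    unsigned payload = skb_len - doff * 4;   /* TCP payload bytes */
    return seq + syn + fin + payload;
}

int main(void)
{
    /* A bare SYN: seq = 1000, header-only segment (doff = 5, 20 bytes). */
    printf("end_seq = %u\n", (unsigned)tcp_end_seq(1000, 1, 0, 20, 5)); /* 1001 */
    return 0;
}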
Next, let us analyze the main processing function, tcp_v4_do_rcv:
1800 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801 {
1802 struct sock *rsk;
...
1835 if (sk->sk_state == TCP_LISTEN) {
1836 struct sock *nsk = tcp_v4_hnd_req(sk, skb); //look up a request sock
1837 if (!nsk)
1838 goto discard;
1839
1840 if (nsk != sk) {
1841 sock_rps_save_rxhash(nsk, skb);
1842 if (tcp_child_process(sk, nsk, skb)) {
1843 rsk = nsk;
1844 goto reset;
1845 }
1846 return 0;
1847 }
1848 } else
1849 sock_rps_save_rxhash(sk, skb);
1850
1851 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1852 rsk = sk;
1853 goto reset;
1854 }
1855 return 0;
...
On the server side the socket state is necessarily TCP_LISTEN, so the test at line 1835 succeeds. tcp_v4_hnd_req looks for a connection in the "half-established" state (SYN|ACK sent, waiting for the ACK):
1739 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1740 {
1741 struct tcphdr *th = tcp_hdr(skb);
1742 const struct iphdr *iph = ip_hdr(skb);
1743 struct sock *nsk;
1744 struct request_sock **prev;
1745 /* Find possible connection requests. */
1746 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1747 iph->saddr, iph->daddr);
1748 if (req) //for the first SYN, req must be NULL
1749 return tcp_check_req(sk, skb, req, prev, false);
1750
1751 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1752 th->source, iph->daddr, th->dest, inet_iif(skb));
1753
1754 if (nsk) { //for the first SYN, nsk must be NULL
1755 if (nsk->sk_state != TCP_TIME_WAIT) {
1756 bh_lock_sock(nsk);
1757 return nsk;
1758 }
1759 inet_twsk_put(inet_twsk(nsk));
1760 return NULL;
1761 }
...
1767 return sk;
1768 }
Note that lines 1751-1752 perform yet another lookup in the established table. Didn't __inet_lookup_skb already do that at line 2002 of tcp_v4_rcv? Is the lookup in tcp_v4_hnd_req unnecessary? It is very necessary, because Linux supports multiprocessors. Consider this scenario: the server has two CPUs and the client sends two SYNs in a row. The first SYN arrives and is handled by CPU0, whose __inet_lookup_skb can, at that moment, find the socket only in the listen table. Before CPU0 manages to lock that socket, CPU1 receives the second SYN, races through the entire three-way handshake and establishes the connection; only then does CPU0 get around to locking the socket and calling tcp_v4_hnd_req, at which point inet_lookup_established finds the socket that CPU1 created. Without the handling at lines 1751-1752, the server's TCP would in this case wrongly send a second SYN|ACK and set up a duplicate connection. The probability of this scenario is low, but it is not zero; hence the necessity of this catch-up lookup.
Usually, for the first SYN, tcp_v4_hnd_req returns at line 1767, i.e. it returns the pointer to the listen socket. Back in tcp_v4_do_rcv, the test at line 1840 is then false and control jumps straight to tcp_rcv_state_process at line 1851:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5604 struct inet_connection_sock *icsk = inet_csk(sk);
...
5610 switch (sk->sk_state) {
...
5614 case TCP_LISTEN:
...
5621 if (th->syn) {
5622 if (th->fin)
5623 goto discard; //the FIN flag must not be set in a SYN
5624 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5625 return 1;
...
5644 kfree_skb(skb);
5645 return 0;
5646 }
...
icsk->icsk_af_ops->conn_request points to tcp_v4_conn_request:
1465 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1466 {
1467 struct tcp_options_received tmp_opt;
1468 struct request_sock *req;
1469 struct inet_request_sock *ireq;
1470 struct tcp_sock *tp = tcp_sk(sk);
1471 struct dst_entry *dst = NULL;
1472 __be32 saddr = ip_hdr(skb)->saddr;
1473 __be32 daddr = ip_hdr(skb)->daddr;
1474 __u32 isn = TCP_SKB_CB(skb)->when;
1475 bool want_cookie = false;
1476 struct flowi4 fl4;
1477 struct tcp_fastopen_cookie foc = { .len = -1 };
1478 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1479 struct sk_buff *skb_synack;
1480 int do_fastopen;
...
1486 /* TW buckets are converted to open requests without
1487 * limitations, they conserve resources and peer is
1488 * evidently real one.
1489 */
1490 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {//if the request queue is full and the SYN did not hit a TIME_WAIT socket
1491 want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); //decide whether a SYN cookie should be sent
1492 if (!want_cookie) //if no cookie is to be sent, drop
1493 goto drop;
1494 }
1495
1496 /* Accept backlog is full. If we have already queued enough
1497 * of warm entries in syn queue, drop request. It is better than
1498 * clogging syn queue with openreqs with exponentially increasing
1499 * timeout.
1500 */
1501 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1502 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1503 goto drop; //if the accept backlog is full and the number of request socks that have not yet timed out exceeds 1, drop this request
1504 }
1505
1506 req = inet_reqsk_alloc(&tcp_request_sock_ops);//allocate a request sock structure
1507 if (!req)
1508 goto drop;
1509
1514 tcp_clear_options(&tmp_opt);
1515 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1516 tmp_opt.user_mss = tp->rx_opt.user_mss;
1517 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1518
1519 if (want_cookie && !tmp_opt.saw_tstamp)
1520 tcp_clear_options(&tmp_opt);
1521
1522 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1523 tcp_openreq_init(req, &tmp_opt, skb);
1524
1525 ireq = inet_rsk(req);
1526 ireq->loc_addr = daddr;
1527 ireq->rmt_addr = saddr;
1528 ireq->no_srccheck = inet_sk(sk)->transparent;
1529 ireq->opt = tcp_v4_save_options(skb);
...
1536
1537 if (want_cookie) {
1538 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1539 req->cookie_ts = tmp_opt.tstamp_ok;
1540 } else if (!isn) {
1541 /* VJ's idea. We save last timestamp seen
1542 * from the destination in peer table, when entering
1543 * state TIME-WAIT, and check against it before
1544 * accepting new connection request.
1545 *
1546 * If "isn" is not zero, this request hit alive
1547 * timewait bucket, so that all the necessary checks
1548 * are made in the function processing timewait state.
1549 */
1550 if (tmp_opt.saw_tstamp && //the SYN carries the timestamp option
1551 tcp_death_row.sysctl_tw_recycle && //the system is configured to recycle TIME_WAIT sockets
1552 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && //a route to the destination is found
1553 fl4.daddr == saddr) { //the routed destination IP matches the SYN's source IP
1554 if (!tcp_peer_is_proven(req, dst, true)) {//the time_wait bucket has already expired, so the timestamp is the only way to check the new request against the old connection
1555 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1556 goto drop_and_release;
1557 }
1558 }
1559 /* Kill the following clause, if you dislike this way. */
1560 else if (!sysctl_tcp_syncookies && //SYN cookies are not enabled
1561 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1562 (sysctl_max_syn_backlog >> 2)) && //the request socks of this single socket already occupy more than 3/4 of the maximum SYN backlog
1563 !tcp_peer_is_proven(req, dst, false)) {//the peer host cannot be proven to be alive
...
1573 goto drop_and_release; //a SYN flood attack is quite likely under way, so drop
1574 }
1575
1576 isn = tcp_v4_init_sequence(skb);//generate a random initial sequence number
1577 }
1578 tcp_rsk(req)->snt_isn = isn;
...
1598 skb_synack = tcp_make_synack(sk, dst, req,
1599 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); //build a SYN|ACK
1600
1601 if (skb_synack) {
1602 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);//compute the checksum
1603 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1604 } else
1605 goto drop_and_free;
1606
1607 if (likely(!do_fastopen)) {
1608 int err;
1609 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1610 ireq->rmt_addr, ireq->opt);
1611 err = net_xmit_eval(err); //check the return value; if it is neither success nor a congestion notification, drop this connection
1612 if (err || want_cookie)
1613 goto drop_and_free;
1614
1615 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1616 tcp_rsk(req)->listener = NULL;
1617 /* Add the request_sock to the SYN table */
1618 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
...
1633 return 0;
1634 }
Code analysis:
1490 and 1501: cap the number of pending connection requests, so that connections that are not yet established cannot consume too much memory (a user-space sketch of the two queues involved follows below)
1506: allocate a request_sock to hold the connection information; a request_sock is used instead of a full socket because the former is far smaller, which greatly reduces memory consumption
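These two checks guard the two server-side queues: inet_csk_reqsk_queue_is_full at line 1490 bounds the SYN queue of half-open requests (limited by the listen backlog and net.ipv4.tcp_max_syn_backlog), while sk_acceptq_is_full at line 1501 bounds the accept queue of established connections waiting for accept(). A minimal sketch of where the accept-queue size comes from, assuming a Linux host (the port and backlog values are arbitrary examples):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);                  /* example port */
    if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("socket/bind");
        return 1;
    }
    /* The backlog argument sizes the accept queue tested by
     * sk_acceptq_is_full() at line 1501; the kernel clamps it to
     * net.core.somaxconn. */
    if (listen(fd, 128) < 0) {
        perror("listen");
        return 1;
    }
    pause();                                      /* keep the listener alive */
    close(fd);
    return 0;
}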
1523: the tcp_openreq_init function saves the information carried in the SYN request:
1075 static inline void tcp_openreq_init(struct request_sock *req,
1076 struct tcp_options_received *rx_opt,
1077 struct sk_buff *skb)
1078 {
1079 struct inet_request_sock *ireq = inet_rsk(req);
1080
1081 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
1082 req->cookie_ts = 0;
1083 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
1084 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
1085 tcp_rsk(req)->snt_synack = 0;
1086 req->mss = rx_opt->mss_clamp;
1087 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
1088 ireq->tstamp_ok = rx_opt->tstamp_ok;
1089 ireq->sack_ok = rx_opt->sack_ok;
1090 ireq->snd_wscale = rx_opt->snd_wscale;
1091 ireq->wscale_ok = rx_opt->wscale_ok;
1092 ireq->acked = 0;
1093 ireq->ecn_ok = 0;
1094 ireq->rmt_port = tcp_hdr(skb)->source;
1095 ireq->loc_port = tcp_hdr(skb)->dest;
1096 }
1084: rcv_nxt is the starting sequence number expected of the next incoming segment, which here must be the SYN's seq + 1; when an ACK is sent, rcv_nxt is used as the acknowledgment number
Back to tcp_v4_conn_request:
1526-1529: save the addresses, options and other information for use when the established connection is set up
1609-1610: fill in the IP header of the SYN|ACK and send it; the SYN|ACK is deliberately not kept in any queue, so that a SYN flood attack cannot burn memory
1618: add the request_sock to the SYN hash table and arm a timer; on expiry the SYN|ACK is retransmitted or the request_sock is deleted
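On the want_cookie path at line 1538 the server stores nothing at all: cookie_v4_init_sequence encodes the connection parameters into the ISN itself and recovers them when the client's ACK comes back. A deliberately simplified toy sketch of that idea (hash32 and all constants are ours; the real kernel uses keyed cryptographic hashes over the four-tuple plus a rotating counter, and also encodes the peer's MSS):

#include <stdint.h>

/* Toy illustration of the SYN-cookie concept, NOT the kernel's actual
 * cookie_v4_init_sequence: derive the ISN from a hash of the
 * four-tuple plus a coarse time counter, so no per-connection state
 * needs to be stored before the handshake completes. */
static uint32_t hash32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
    uint32_t in[4] = { a, b, c, d };
    uint32_t h = 2166136261u;            /* FNV-1a style mix, example only */
    for (int i = 0; i < 4; i++) {
        h ^= in[i];
        h *= 16777619u;
    }
    return h;
}

static uint32_t toy_syn_cookie(uint32_t saddr, uint32_t daddr,
                               uint16_t sport, uint16_t dport,
                               uint32_t minute_counter)
{
    return hash32(saddr, daddr,
                  ((uint32_t)sport << 16) | dport, minute_counter);
}

When the final ACK arrives, the server recomputes the hash for the current and previous counter values and compares it against ack_seq - 1 (the ISN it sent); a match proves the handshake happened without any stored request_sock.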
Now let us look at how the SYN|ACK is built and sent. It is built by tcp_make_synack:
2654 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2655 struct request_sock *req,
2656 struct tcp_fastopen_cookie *foc)
2657 {
2658 struct tcp_out_options opts;
2659 struct inet_request_sock *ireq = inet_rsk(req);
2660 struct tcp_sock *tp = tcp_sk(sk);
2661 struct tcphdr *th;
2662 struct sk_buff *skb;
2663 struct tcp_md5sig_key *md5;
2664 int tcp_header_size;
2665 int mss;
2666
2667 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); //allocate the skb
2668 if (unlikely(!skb)) {
2669 dst_release(dst);
2670 return NULL;
2671 }
2672 /* Reserve space for headers. */
2673 skb_reserve(skb, MAX_TCP_HEADER);
2674
2675 skb_dst_set(skb, dst);
...
2682 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2683 __u8 rcv_wscale;
2684 /* Set this up on the first call only */
2685 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2686
2687 /* limit the window selection if the user enforce a smaller rx buffer */
2688 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2689 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2690 req->window_clamp = tcp_full_space(sk);
2691
2692 /* tcp_full_space because it is guaranteed to be the first packet */
2693 tcp_select_initial_window(tcp_full_space(sk),
2694 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2695 &req->rcv_wnd,
2696 &req->window_clamp,
2697 ireq->wscale_ok,
2698 &rcv_wscale,
2699 dst_metric(dst, RTAX_INITRWND));
2700 ireq->rcv_wscale = rcv_wscale;
2701 }
...
2710 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
2711 foc) + sizeof(*th);
2712
2713 skb_push(skb, tcp_header_size);
2714 skb_reset_transport_header(skb);
2715
2716 th = tcp_hdr(skb);
2717 memset(th, 0, sizeof(struct tcphdr));
2718 th->syn = 1;
2719 th->ack = 1;
2720 TCP_ECN_make_synack(req, th);
2721 th->source = ireq->loc_port;
2722 th->dest = ireq->rmt_port;
...
2726 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2727 TCPHDR_SYN | TCPHDR_ACK);
2728
2729 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2730 /* XXX data is queued and acked as is. No buffer/window check */
2731 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2732
2733 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2734 th->window = htons(min(req->rcv_wnd, 65535U));
2735 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2736 th->doff = (tcp_header_size >> 2);
...
2747 return skb;
2748 }
Code analysis:
2693-2700: initialize the advertised window size
2710-2711: build the options
2716-2736: fill in the TCP header fields; at line 2731 the SYN|ACK's ack_seq is set from the rcv_nxt saved by tcp_openreq_init, i.e. the SYN's seq + 1, and at line 2734 the window is clamped to 65535 because it is never scaled in a SYN or SYN|ACK (see the sketch below)
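The RFC 1323 rule at lines 2733-2734 is worth a concrete illustration: window scaling applies only after the handshake, so the window field of a SYN or SYN|ACK is clamped to 65535, while later segments carry the window shifted right by the negotiated scale. A small sketch with made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only: how a large receive window is advertised
 * in the SYN|ACK (unscaled, clamped) versus in later segments
 * (scaled by the negotiated shift). */
int main(void)
{
    uint32_t rcv_wnd = 180000;   /* hypothetical receive window, bytes */
    unsigned wscale = 7;         /* hypothetical negotiated scale */

    uint16_t synack_window = rcv_wnd > 65535 ? 65535 : (uint16_t)rcv_wnd;
    uint16_t later_window = (uint16_t)(rcv_wnd >> wscale);

    printf("SYN|ACK advertises %u (clamped, unscaled)\n",
           (unsigned)synack_window);
    printf("later segments advertise %u, i.e. %u bytes after scaling\n",
           (unsigned)later_window, (unsigned)((uint32_t)later_window << wscale));
    return 0;
}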
With that, the SYN|ACK itself is complete; building the rest of the packet and sending it is the job of ip_build_and_send_pkt:
129 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
130 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
131 {
132 struct inet_sock *inet = inet_sk(sk);
133 struct rtable *rt = skb_rtable(skb);
134 struct iphdr *iph;
135
136 /* Build the IP header. */
137 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
138 skb_reset_network_header(skb);
139 iph = ip_hdr(skb);
140 iph->version = 4; //version
141 iph->ihl = 5; //header length
142 iph->tos = inet->tos; //TOS: type of service
143 if (ip_dont_fragment(sk, &rt->dst))
144 iph->frag_off = htons(IP_DF); //don't fragment
145 else
146 iph->frag_off = 0;
147 iph->ttl = ip_select_ttl(inet, &rt->dst); //TTL: time to live
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol; //transport-layer protocol number
151 ip_select_ident(iph, &rt->dst, sk);
152
153 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2;
155 ip_options_build(skb, &opt->opt, daddr, rt, 0); //fill in the options
156 }
157
158 skb->priority = sk->sk_priority;
159 skb->mark = sk->sk_mark;
160
161 /* Send it out. */
162 return ip_local_out(skb);
163 }
To summarize, the server-side TCP handles an incoming SYN as follows:
1. Create a relatively small data structure, a request_sock, and save the connection information in it
2. Add the request_sock to the SYN table, so that the connection information can be found when the ACK arrives
3. Build a SYN|ACK and send it, arming a retransmission timer in case the SYN|ACK is lost (see the sketch below)
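On point 3: the timer armed by inet_csk_reqsk_queue_hash_add starts at TCP_TIMEOUT_INIT (1 second in kernels of this vintage) and doubles on each expiry, for up to net.ipv4.tcp_synack_retries attempts (default 5), after which the request_sock is dropped. A sketch of the resulting schedule, assuming those defaults:

#include <stdio.h>

/* Illustrative SYN|ACK retransmission schedule: a 1-second initial
 * timeout doubling on each retry, with the default of 5 retries
 * (net.ipv4.tcp_synack_retries). Assumes default settings. */
int main(void)
{
    unsigned timeout = 1, total = 0;              /* seconds */
    for (int retry = 1; retry <= 5; retry++) {
        total += timeout;
        printf("retransmit %d fires at t=%us\n", retry, total);
        timeout *= 2;
    }
    return 0;
}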
Having sent the SYN|ACK, the server now waits for the ACK that the client will send once it has processed the SYN|ACK. How does the client handle the SYN|ACK, and how does the server then handle the ACK so that the three-way handshake completes?