TCP的核心系列 — 重传队列的更新和时延的采样（一）

重传队列实际上就是发送队列(sk->sk_write_queue)，保存着发送且未确认的数据段。

当有新的数据段被确认时，需要把这些段从重传队列中删除，同时更新一些变量，包括

packets_out、sacked_out、lost_out、retrans_out等。

对于非重复的ACK，会进行RTT采样，用于更新srtt和rto等时延信息。

本文主要内容：tcp_clean_rtx_queue()的实现。

内核版本：3.2.12

Author：zhangskd @ ****

函数实现

Q：什么是重传队列？

A：重传队列实际上就是发送队列(sk->sk_write_queue)，保存着发送且未确认的数据段。

The retransmit queue is implemented using a linked list to hold all the packets currently in

flight to the receiver.

Q：tcp_clean_rtx_queue()是干什么的？

A：tcp_clean_rtx_queue() is called to remove and free the acknowledged packets from the

retransmit queue, and packets_out is decremented by the number of freed packets.

/* Remove acknowledged frames from the retransmission queue.

 * If our packet is before the ack sequence we can discard it as it's confirmed to

 * have arrived at the other end.

 */

static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una)

{

    struct tcp_sock *tp = tcp_sk(sk);

    const struct inet_connection_sock *icsk = inet_csk(sk);

    struct sk_buff *skb;

    u32 now = tcp_time_stamp; /* 当前时间，用于计算RTT */

    int fully_acked = 1; /* 表示数据段是否完全被确认 */

    int flag = 0;

    u32 pkts_acked = 0;

    u32 reord = tp->packets_out;

    u32 prior_sacked = tp->sacked_out;

    s32 seq_rtt = -1;

    s32 ca_seq_rtt = -1;

    ktime_t last_ackt = net_invalid_timestamp(); /* 把last_ackt置为0*/

    /* 遍历发送队列sk_write_queue

      * 注意：遍历到snd_una即停止，也就是说如果snd_una没更新，那么这个循环马上就退出！

      */

    while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {

        struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

        u32 acked_pcount;

        u32 sacked = scb->sacked; /* 记分板scoreboard */

        /* Determine how many packets and what bytes were acked, tso and else.

         * tp->snd_una已经是更新过的了，所以从发送队列头到snd_una就是此ACK确认的数据量。

         */

        if (after(scb->end_seq, tp->snd_una)) {

            /* 如果没有使用TSO，或seq >= snd_una，那么就退出遍历*/

            if (tcp_skb_pcount(skb) == 1 || ! after(tp->snd_una, scb->seq))

                break;

            /* 如果只确认了TSO段中的一部分，则截掉确认的部分，并统计确认了多少段*/

            acked_pcount = tcp_tso_acked(sk, skb);

            if (! acked_pcount) /* 处理出错 */

                break;

            fully_acked = 0; /* 表示没有确认完TSO段*/

        } else {

            acked_pcount = tcp_skb_pcount(skb); /* 统计确认段的个数*/

        }

        /* 如果此段被重传过*/

        if (sacked & TCPCB_RETRANS) {

            if (sacked & TCPCB_SACKED_RETRANS) /* 之前重传了还没有恢复*/

                tp->retrans_out -= acked_pcount; /* 更新网络中重传且未确认段的数量*/

            flag |= FLAG_RETRANS_DATA_ACKED;

            ca_seq_rtt = -1;

            seq_rtt = -1;

            if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))

                flag |= FLAG_NONHEAD_RETRANS_ACKED;

        } else { /* 如果此段没有被重传过*/

            ca_seq_rtt = now - scb->when; /* 通过此ACK计算skb的RTT采样值*/

            last_ackt = skb->tstamp; /* 获取此skb的发送时间，可以精确到纳秒！*/

            if (seq_rtt < 0) {

                seq_rtt = ca_seq_rtt;

            }

            /* 如果SACK块中有空洞，那么保存其中序号最小号的 */

            if (! (sacked & TCPCB_SACKED_ACKED))

                reord = min(pkts_acked, reord);

        }

        /* 如果skb之前是带有SACK标志 */

        if (sacked & TCPCB_SACKED_ACKED)

            tp->sacked_out -= acked_pcount; /* 更新sacked_out */

        /* 如果skb之前是带有LOST标志 */

        if (sacked & TCPCB_LOST)

            tp->lost_out -= acked_pcount; /* 更新lost_out */

        tp->packets_out -= acked_pcount; /* 更新packets_out */

        pkts_acked += acked_pcount; /* 累加此ACK确认的数据量*/

        /* Initial outgoing SYN's get put onto the write_queue just like anything else

         * we transmit. It is not true data, and if we misinform our callers that this ACK

         * acks real data, we will erroneously exit connection startup slow start one packet

         * too quickly. This is severely frowned upon behavior.

         */

        if (! (scb->flags & TCPHDR_SYN)) {

            flag |= FLAG_DATA_ACKED; /* 确认了新的数据 */

        } else {

            flag |= FLAG_SYN_ACKED; /* 确认了SYN段 */

            tp->retrans_stamp = 0; /* Clear the stamp of the first SYN */

        }

        if (! fully_acked) /* 如果TSO段没被完全确认，则到此为止*/

            break; 

        tcp_unlink_write_queue(skb, sk); /* 从发送队列上移除skb */

        sk_wmem_free_skb(sk, skb); /* 删除skb的内存对象*/

        tp->scoreboard_skb_hint = NULL;

        if (skb == tp->retransmit_skb_hint)

            tp->retransmit_skb_hint = NULL;

         if (skb  == tp->lost_skb_hint)

             tp->lost_skb_hint = NULL;

    } /* 退出循环了这里*/

    if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))

        tp->snd_up = tp->snd_una; /* 更新Urgent pointer */

    if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))

        flag |= FLAG_SACK_RENEGING; /* 虚假的SACK */

    /* 如果此ACK确认了新数据，使snd_una前进了*/

    if (flag & FLAG_ACKED) {

        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;

        /* 如果路径MTU的探测段被确认了*/

        if (unlikely(icsk->icsk_mtup.probe_size &&

             ! after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {

            tcp_mtup_probe_success(sk);

        }

        /* 更新srtt、RTO等RTT相关变量*/

        tcp_ack_update_rtt(sk, flag, seq_rtt);

        tcp_rearm_rto(sk); /* 重置超时重传定时器*/

        if (tcp_is_reno(tp)) {

            /* Reno模拟SACK处理，更新tp->sacked_out。

             * 如果检测到乱序，更新tp->reordering。

             */

            tcp_remove_reno_sacks(sk, pkts_acked);

        } else {

            int delta;

            /* Non-retransmitted hole got filled? That's reordering。

             * 如果之前没有SACK，prior_fackets为0，不会更新。

             */

            if (reord < prior_fackets)

                tcp_update_reordering(sk, tp->fackets_out - reord, 0); /* 更新乱序队列大小*/

            delta = tcp_is_fack(tp) ? pkts_acked : prior_sacked - tp->sacked_out;

            tp->lost_cnt_hint = -= min(tp->lost_cnt_hint, delta);

        }

        tp->fackets_out -= min(pkts_acked, tp->fackets_out); /* 更新fackets_out */

        /* 如果定义了pkts_acked()钩子*/

        if (ca_ops->pkts_acked) {

            s32 rtt_us = -1;

            /* Is the ACK triggering packet unambiguous?，确认了非重传的数据段 */

            if (! (flag & FLAG_RETRANS_DATA_ACKED)) {

                /* High resolution needed and available?

                 * 高精确度的RTT测量，可以精确到微秒！

                 */

                 if (ca_ops->flags & TCP_CONG_RTT_STAMP &&

                      ! ktime_equal(last_ackt, net_invalid_timestamp()))

                     rtt_us = ktime_us_delta(ktime_get_real(), last_ackt);

                 else if (ca_seq_rtt >=0) /* 普通测量，精确到毫秒，再转为微秒*/

                     rtt_us = jiffies_to_usecs(ca_seq_rtt);

            }

            ca_ops->pkts_acked(sk, pkts_acked, rtt_us); /* 我们可以自定义的 */

        }

    } 

#if FASTRETRANS_DEBUG > 0

    WARN_ON((int) tp->sacked_out < 0);

    WARN_ON((int) tp->lost_out < 0);

    WARN_ON((int) tp->retrans_out < 0);

    if (! tp->packets_out && tcp_is_sack(tp)) {

        icsk = inet_csk(sk);

        if (tp->lost_out) {

            printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, icsk->icsk_ca_state);

            tp->lost_out = 0;

        }

        if (tp->sacked_out) {

            printk(KERN_DEBUG "Leak s=%u %d\n", tp->sacked_out, icsk->icsk_ca_state);

            tp->sacked_out = 0;

        }

        if (tp->retrans_out) {

            printk(KERN_DEBUG "Leak r=%u %d\n", tp->retrans_out, icsk->icsk_ca_state);

            tp->retrans_out = 0;

        }

    }

#endif

    return flag;

}

ktime_t

/*

 * ktime_t

 * On 64-bit CPUs a single 64-bit variable is used to store the hrtimes internal

 * representation of time values in scalar nanoseconds. The design plays out

 * best on 64-bit CPUs, where most conversions are NOPs and most arithmetic

 * ktime_t operations are plain arithmetic operations.

 *

 * On 32-bit CPUs an optimized representation of the timespec structure is used to

 * avoid expensive conversions from and to timespecs. The endian-aware order of

 * the tv struct members is chosen to allow mathematical operations on the tv64

 * member of the union too, which for certain operations produces better code.

 *

 * For architectures with efficient support for 64/32-bit conversions the plain scalar

 * nanosecond based representation can be selected by the config switch

 * CONFIG_KTIME_SCALAR.

 */

union ktime {

    s64 tv64;

#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)

    struct {

#ifdef __BIG_ENDIAN

    s32 sec, nsec;

#else

    s32 nsec, sec;

#endif

    } tv;

#endif

};

typedef union ktime ktime_t;

/* 返回值为0的ktime_t*/

static inline ktime_t net_invalid_timestamp(void)

{

    return ktime_set(0, 0);

}

TSO

当TSO段不是整个被确认，而是被确认一部分时，那么就分割TSO段，返回确认的段数。

/* If we get here, the whole TSO packet has not been acked. */

static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)

{

    struct tcp_sock *tp = tcp_sk(sk);

    u32 packets_acked;

    BUG_ON(! after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));

    packets_acked = tcp_skb_pcount(skb); /* tso段总共包括多少个段*/

    if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))

        return 0;

    packets_acked -= tcp_skb_pcount(skb); /* 减去未确认的段*/

    if (packets_acked) {

        BUG_ON(tcp_skb_pcount(skb) == 0);

        BUG_ON(! before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));

    }

    return packets_acked; /* 返回确认的段数 */

}

秒客网

TCP的核心系列 — 重传队列的更新和时延的采样（一）

函数实现

ktime_t

TSO

相关文章