linux 4.9内核,bbr的带宽估计问题。
一个正常的bbr流量图:
对应的rtt图形:
一个异常的bbr流量图:
可以看出,异常的bbr流量图中出现了一个很低的带宽,并且稳定在这个带宽10s左右;而正常情况下,这个文件的下载不应该超过10s。由于播放消耗数据的速度大于下载速度,导致了用户播放卡顿。
通过分析,我们确认了bbr在应对delay_ack时会出现带宽估计偏低的情况:比如正常的min_rtt是1.3ms,而出现delay_ack之后实际的rtt是40ms左右;由于probe_rtt要等10秒之后才会进行,这期间min_rtt仍然停留在1.3ms,那么对应的bw乘以最小rtt(即估算出的BDP)就处于偏低的状态。
在稳定10s之后,bbr会主动探测rtt。
bbr探测rtt的时候,行为是什么样的呢?对于4.9内核版本来说,是最多发送4个mss包:
cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
从10.3s左右的流量图来看,可以看到一个很明显的向下缺口,这就是探测最小rtt的行为:每个rtt只发4个包,流量再次下降。
这个持续的时间为0.2s,也就是代码里面如下的描述:
/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
* periodically drain the bottleneck queue, to converge to measure the true
* min_rtt (unloaded propagation delay). This allows the flows to keep queues
* small (reducing queuing delay and packet loss) and achieve fairness among
* BBR flows.
*
* The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
* we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
* After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
* round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
* re-enter the previous mode. BBR uses 200ms to approximately bound the
* performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
*
* Note that flows need only pay 2% if they are busy sending over the last 10
* seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
* natural silences or low-rate periods within 10 seconds where the rate is low
* enough for long enough to drain its queue in the bottleneck. We pick up
* these min RTT measurements opportunistically with our min_rtt filter. :-)
*/
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
虽然经过10s,根据探测rtt之后的min_rtt应该修正为40ms左右,会导致流量上升,但由于我们的分片文件已经播放卡顿超时,客户端会断链。
结论:如果下载的流媒体文件过小,不建议开启bbr。特别是收包的客户端进入delay_ack非常早,并且不怎么出现乱序和丢包的情况。此现象在4.14内核依然存在。
回到问题本身,为什么会出现带宽估计过低的行为?我们通过什么样的手段去复现这种行为?由于现场的机顶盒我们没法拿到,那么只能通过自己来模拟:
首先,构建一个通过http获取文件的请求,客户端只需被动收包即可。结果复现不了,也就是说客户端这边的delay_ack并不会在bbr的startup阶段出现。
所以,我们反过来想:能不能让服务器端收到的ack发生聚集,在startup阶段就收到聚集的ack呢?顺着这种思路,我们仿照gro,将收到的ack进行聚集,每隔40ms左右才将ack报文送上协议栈,
这样从服务器的角度看,客户端的ack就是以这种频率发回来的。
然后,我们在服务器bbr相关代码地方加printk:
首先增加四个地方,就是bbr的状态切换的地方:
static void bbr_reset_startup_mode(struct sock *sk)
{
struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_STARTUP;
if(((struct inet_sock*)sk)->inet_dport == )
{
printk("mode 2 mode=%d,min_rtt_us=%d,full_bw=%d,cycle_idx=%d,pacing_gain=%d,cwnd_gain=%d,rtt_cnt=%d\r\n",bbr->mode,bbr->min_rtt_us,bbr->full_bw,bbr->cycle_idx,bbr->pacing_gain,bbr->cwnd_gain,bbr->rtt_cnt);
} 。。。。
} static void bbr_reset_probe_bw_mode(struct sock *sk)
{
struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_PROBE_BW;
if(((struct inet_sock*)sk)->inet_dport == )
{
printk("mode 2 mode=%d,min_rtt_us=%d,full_bw=%d,cycle_idx=%d,pacing_gain=%d,cwnd_gain=%d,rtt_cnt=%d\r\n",bbr->mode,bbr->min_rtt_us,bbr->full_bw,bbr->cycle_idx,bbr->pacing_gain,bbr->cwnd_gain,bbr->rtt_cnt);
}
。。。。
} static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk); if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
if(((struct inet_sock*)sk)->inet_dport == )
{
printk("mode 2 mode=%d,min_rtt_us=%d,full_bw=%d,cycle_idx=%d,pacing_gain=%d,cwnd_gain=%d,rtt_cnt=%d\r\n",bbr->mode,bbr->min_rtt_us,bbr->full_bw,bbr->cycle_idx,bbr->pacing_gain,bbr->cwnd_gain,bbr->rtt_cnt);
} 。。。
} static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
bool filter_expired;
static int count_expire=; 。。。。。。if (rs->rtt_us >= &&
(rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
bbr->min_rtt_us = rs->rtt_us;
bbr->min_rtt_stamp = tcp_time_stamp;
} if (bbr_probe_rtt_mode_ms > && filter_expired &&
!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
if(((struct inet_sock*)sk)->inet_dport == )
{
printk("mode 2 mode=%d,min_rtt_us=%d,full_bw=%d,cycle_idx=%d,pacing_gain=%d,cwnd_gain=%d,rtt_cnt=%d\r\n",bbr->mode,bbr->min_rtt_us,bbr->full_bw,bbr->cycle_idx,bbr->pacing_gain,bbr->cwnd_gain,bbr->rtt_cnt);
} bbr->pacing_gain = BBR_UNIT;
.....
}
static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
u32 bw;
bbr_update_model(sk, rs);
bw = bbr_bw(sk);
bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
bbr_set_tso_segs_goal(sk);
bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
if(((struct inet_sock*)sk)->inet_dport == 3000)
{
printk("main mode=%d,min_rtt_us=%d,cur_bw=%d,cycle_idx=%d,pacing_gain=%d,cwnd_gain=%d,rtt_cnt=%d,snd_cwnd=%d, snd_nxt=%u,snd_una=%u\r\n",bbr->mode,bbr->min_rtt_us,bbr_bw(sk),bbr->cycle_idx,bbr->pacing_gain,bbr->cwnd_gain,bbr->rtt_cnt,tcp_sk(sk)->snd_cwnd, tcp_sk(sk)->snd_nxt, tcp_sk(sk)->snd_una );
}
}
除了四个状态转换的地方,由于我们收到ack,都会进入bbr_main,所以我们也在这个地方加了打印。
打印结果如下:
[11241.360364] mode mode=,min_rtt_us=,full_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=
[11241.360373] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.360377] mode mode=,min_rtt_us=,full_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=
[11241.367057] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.369053] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.375055] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.383053] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.392054] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.401053] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.409054] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.417055] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.426086] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.434054] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11241.442055] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
。。。。。 中间只增加rtt_cnt的部分省略:
[11243.420044] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.429045] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.437044] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.446046] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.454046] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.502075] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=195------------rtt从8ms变成48ms
[11243.544064] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=195-----------42ms
[11243.586056] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=195------------42ms
[11243.628057] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=195------------42ms
[11243.670058] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=195------------42ms
[11243.753064] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=195------------83ms
[11243.795063] mode mode=,min_rtt_us=,full_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=257----------------从startup模式切换到drain模式
[11243.795068] mode mode=,min_rtt_us=,full_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=257----------------立刻从drain模式切换到探测bw模式
[11243.795072] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.837061] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.879063] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.921061] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11243.963063] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.005063] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.047065] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.089065] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.131066] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.173065] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.215069] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.257067] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.299067] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.341077] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.383073] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.425070] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.467071] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.509073] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.551076] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.593078] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.635078] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.677079] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.719083] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11244.761082] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
黄色标注的部分(即上面日志中ack间隔从8ms左右突然变为40多ms的那几行)就是我们突然使用delay_ack的地方,这个可以通过前后两个ack的时间差来确定。
可以看到,bbr反应相当剧烈,在一个RTO左右的时间内就将snd_cwnd迅速降低。
ps:初始状态并不是0,这个并不是我们代码打印有问题,而是内核中真的就这么执行的,个人觉得是个bug,具体分析可以参见《https://www.cnblogs.com/10087622blog/p/10412440.html》。
我们复现的带宽图:
对应的rtt图:
怎么解释在10s附近,带宽增长很多呢?继续根据我们printk的数据来分析:
[11251.018193] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.060199] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.102199] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.144200] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.186198] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.228199] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.270202] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.312199] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.354205] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.396210] mode mode=,min_rtt_us=,full_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=438------mode从探测bw到探测rtt
[11251.396215] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=4----取4个mss来发包
[11251.438203] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=4----稳定的min_rtt变成41442us
[11251.480205] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.522214] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.564202] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=4----探测持续200ms,符合代码
[11251.606206] mode mode=,min_rtt_us=,full_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=443-------mode从探测rtt回到探测bw
[11251.606215] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=12---开始指数增长我们的cwnd
[11251.648208] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.690214] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.732220] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.774230] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.816239] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.858244] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.900231] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.942238] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11251.984226] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.026236] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.068228] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.110231] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.152229] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.194235] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.236229] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.278264] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
[11252.320232] main mode=,min_rtt_us=,cur_bw=,cycle_idx=,pacing_gain=,cwnd_gain=,rtt_cnt=,snd_cwnd=
其他内核没有测试。遇到该问题的童鞋,可以尝试如下补丁:
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/net/ipv4/tcp_bbr.c?id=78dc70ebaa38aa303274e333be6c98eef87619e2
有测试好的补丁也欢迎讨论。
参考资料:
https://queue.acm.org/detail.cfm?id=3022184