The previous posts looked mostly at what the kernel does, scheduling-wise, when a new process is created. Now let's look at what happens on a timer tick. When the timer interrupt fires, the interrupt handling path eventually calls scheduler_tick():
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
* It also gets called by the fork code, when changing the parent's
* timeslices.
*/
void scheduler_tick(void)
{
int cpu = smp_processor_id();//id of the current cpu
struct rq *rq = cpu_rq(cpu);//the runqueue owned by that cpu; runqueues is a per-CPU variable (a separate post on per-CPU variables is planned)
struct task_struct *curr = rq->curr;//the task currently running on this cpu
sched_clock_tick();//a no-op unless CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is defined, in which case it updates the per-CPU sched_clock data; not central to scheduling policy, so not analyzed here
spin_lock(&rq->lock);//take the runqueue lock: other cpus may touch this runqueue concurrently (e.g. an idle cpu "pulling" tasks from it during load balancing), so the lock prevents races
update_rq_clock(rq);//update the runqueue clock; the low-level details amount to reading the current time from the cpu's clock source
update_cpu_load(rq);//update the cpu load statistics; the formula is more heuristic than rigorous, to be revisited if we ever need it
curr->sched_class->task_tick(rq, curr, 0);//dispatch to the tick handler of curr's scheduling class
spin_unlock(&rq->lock);//release the runqueue lock
#ifdef CONFIG_SMP//load-balancing related; analyzed in a later post dedicated to load balancing
rq->idle_at_tick = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
}
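Two things in scheduler_tick() are worth a minimal illustration: the lookup of this CPU's runqueue and the dispatch through curr->sched_class->task_tick. Below is a toy user-space sketch of that structure; every toy_* name here is invented for illustration and is not a kernel definition.
#include <stdio.h>

struct toy_rq;
struct toy_task;

struct toy_sched_class {
    void (*task_tick)(struct toy_rq *rq, struct toy_task *curr); /* per-class tick hook */
};

struct toy_task {
    const char *name;
    const struct toy_sched_class *sched_class; /* which policy owns this task */
};

struct toy_rq {
    struct toy_task *curr;                     /* task currently running on this cpu */
};

static void fair_task_tick(struct toy_rq *rq, struct toy_task *curr)
{
    (void)rq;
    printf("tick: CFS accounting for %s\n", curr->name);
}

static const struct toy_sched_class toy_fair_class = { .task_tick = fair_task_tick };

#define TOY_NR_CPUS 2
static struct toy_rq toy_runqueues[TOY_NR_CPUS];  /* the kernel uses a per-CPU variable here */

/* toy scheduler_tick: find this cpu's runqueue, dispatch to the class hook */
static void toy_scheduler_tick(int cpu)
{
    struct toy_rq *rq = &toy_runqueues[cpu];
    rq->curr->sched_class->task_tick(rq, rq->curr);
}

int main(void)
{
    struct toy_task t = { .name = "bash", .sched_class = &toy_fair_class };
    toy_runqueues[0].curr = &t;
    toy_scheduler_tick(0);
    return 0;
}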
For a task in the CFS class, the task_tick hook is task_tick_fair():
/*
* scheduler tick hitting a task of our scheduling class:
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {//group-scheduling loop; without CONFIG_FAIR_GROUP_SCHED it iterates exactly once, so it can be read as a single pass
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);//the real work happens in this function
}
}
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);//update the current task's virtual runtime, already analyzed in an earlier post
#ifdef CONFIG_SCHED_HRTICK//high-resolution tick handling, skipped here
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))//if more than one task is runnable (or wakeup preemption is disabled), check whether the current task should be preempted
check_preempt_tick(cfs_rq, curr);
}
Now we get to the key part:
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
ideal_runtime = sched_slice(cfs_rq, curr);//compute the wall-clock slice the current task is entitled to in this period
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime)//if it has already run longer than its slice, set the resched flag; the actual reschedule happens when the tick returns to user mode
resched_task(rq_of(cfs_rq)->curr);
}
The timer tick fires all the time, so why not simply store each task's entitled runtime in its task_struct? Wouldn't that save work? Indeed, earlier kernels did exactly that, but it has a drawback: a task's timeslice is essentially fixed once the task is created (the O(1) scheduler could adjust slices heuristically based on observed behaviour, but that does not really solve the problem). Recomputing the slice each time instead distributes time dynamically according to how many tasks are on the queue, which is more reasonable: as the number of runnable tasks grows, each task's share of the period shrinks (and once there are more tasks than sched_nr_latency, the period itself is stretched so that no slice falls below the minimum granularity). The computation itself is simple, so the code is shown without comments; a small worked sketch follows calc_delta_weight() below. One question to keep in mind: why is this done in wall-clock time rather than virtual time?
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;
if (unlikely(!se->on_rq))
nr_running++;
return calc_delta_weight(__sched_period(nr_running), se);
}
/*
* delta *= P[w / rw]
*/
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
{
for_each_sched_entity(se) {
delta = calc_delta_mine(delta,
se->load.weight, &cfs_rq_of(se)->load);
}
return delta;
}
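To make the slice computation concrete, here is a small user-space sketch that mimics __sched_period() and the weight-proportional split s = p * w / rw, and then applies the comparison check_preempt_tick() makes. The constants (20 ms latency, 4 ms minimum granularity, nr_latency of 5) and the weights are assumptions chosen for illustration; check sysctl_sched_latency and friends on your kernel.
#include <stdio.h>

/* Assumed defaults, in nanoseconds (illustrative; see sysctl_sched_* in your kernel). */
#define SCHED_LATENCY_NS      20000000ULL  /* target period when few tasks run        */
#define MIN_GRANULARITY_NS     4000000ULL  /* lower bound on one slice                */
#define NR_LATENCY                     5   /* SCHED_LATENCY_NS / MIN_GRANULARITY_NS   */

/* toy __sched_period: stretch the period once there are "too many" tasks */
static unsigned long long toy_sched_period(unsigned long nr_running)
{
    if (nr_running > NR_LATENCY)
        return MIN_GRANULARITY_NS * nr_running;
    return SCHED_LATENCY_NS;
}

/* toy sched_slice: s = p * w / rw, the weight-proportional share of the period */
static unsigned long long toy_sched_slice(unsigned long nr_running,
                                          unsigned long weight,
                                          unsigned long rq_weight)
{
    return toy_sched_period(nr_running) * weight / rq_weight;
}

int main(void)
{
    /* Three runnable tasks: two with the nice-0 weight (1024) and one heavier. */
    unsigned long w[3] = { 1024, 1024, 2048 };
    unsigned long rw = w[0] + w[1] + w[2];

    for (int i = 0; i < 3; i++)
        printf("task %d: slice = %llu ns\n", i, toy_sched_slice(3, w[i], rw));

    /* The tick check: has the current task used up its slice? */
    unsigned long long sum_exec = 9000000, prev_sum_exec = 3500000;
    unsigned long long delta_exec = sum_exec - prev_sum_exec;
    unsigned long long ideal = toy_sched_slice(3, w[0], rw);
    printf("delta_exec=%llu ideal=%llu -> %s\n", delta_exec, ideal,
           delta_exec > ideal ? "resched" : "keep running");
    return 0;
}
With these example numbers the two nice-0 tasks each get 5 ms of the 20 ms period, the heavier task gets 10 ms, and a nice-0 task that has already run 5.5 ms in this round gets flagged for rescheduling.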
So the tick handling is actually quite simple: update the current task's runtime statistics, compare them with the ideal slice, and if the task has overrun, set the resched flag so that a reschedule happens when the kernel returns to user mode. Now let's look at what happens when a process gives up the CPU voluntarily, i.e. when schedule() is called explicitly.
/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
int cpu;
need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_qsctr_inc(cpu);//records an RCU quiescent state for this cpu (a context switch is a natural quiescent point); it is per-CPU bookkeeping and does not affect the scheduling policy itself
prev = rq->curr;
switch_count = &prev->nivcsw;
release_kernel_lock(prev);
need_resched_nonpreemptible:
schedule_debug(prev);
if (sched_feat(HRTICK))
hrtick_clear(rq);
spin_lock_irq(&rq->lock);
update_rq_clock(rq);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev)))//if a signal is pending and prev's state allows it to be woken by signals, give it another chance: keep it runnable instead of dequeuing it
prev->state = TASK_RUNNING;
else
deactivate_task(rq, prev, 1);//otherwise remove the task from the runqueue
switch_count = &prev->nvcsw;
}
#ifdef CONFIG_SMP
if (prev->sched_class->pre_schedule)//the CFS class does not implement this hook
prev->sched_class->pre_schedule(rq, prev);
#endif
if (unlikely(!rq->nr_running))//if the runqueue is now empty, try to pull work over from another cpu
idle_balance(cpu, rq);
prev->sched_class->put_prev_task(rq, prev);//let prev's class do its bookkeeping for the task being switched out
next = pick_next_task(rq, prev);
if (likely(prev != next)) {
sched_info_switch(prev, next);
rq->nr_switches++;
rq->curr = next;
++*switch_count;
context_switch(rq, prev, next); /* unlocks the rq */
/*
* the context switch might have flipped the stack from under
* us, hence refresh the local variables.
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
} else
spin_unlock_irq(&rq->lock);
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
EXPORT_SYMBOL(schedule);
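Before following each helper, it may help to see, stripped of locking, SMP and debugging details, what schedule() decides about prev. The sketch below is only a toy paraphrase of that decision, with invented toy_* names and states rather than kernel code: a preempted task stays queued, a sleeping task with a pending signal is given another chance, and everything else is deactivated.
#include <stdio.h>
#include <stdbool.h>

/* Toy task states (values are illustrative, not the kernel's). */
enum toy_state { TOY_RUNNING = 0, TOY_INTERRUPTIBLE = 1, TOY_UNINTERRUPTIBLE = 2 };

struct toy_task {
    const char *name;
    enum toy_state state;
    bool signal_pending;
    bool on_rq;
};

/*
 * Toy version of the decision schedule() makes about 'prev':
 *  - if prev is still TASK_RUNNING (state == 0), it was preempted and stays queued;
 *  - if it set a sleeping state but a signal is pending, give it another chance
 *    and keep it runnable;
 *  - otherwise actually take it off the runqueue (deactivate_task()).
 */
static void toy_handle_prev(struct toy_task *prev, bool preempt_active)
{
    if (prev->state != TOY_RUNNING && !preempt_active) {
        if (prev->state == TOY_INTERRUPTIBLE && prev->signal_pending)
            prev->state = TOY_RUNNING;       /* woken by the pending signal   */
        else
            prev->on_rq = false;             /* deactivate: leave the queue   */
    }
    printf("%s: state=%d on_rq=%d\n", prev->name, prev->state, prev->on_rq);
}

int main(void)
{
    struct toy_task a = { "sleeper",   TOY_INTERRUPTIBLE, false, true };
    struct toy_task b = { "signaled",  TOY_INTERRUPTIBLE, true,  true };
    struct toy_task c = { "preempted", TOY_RUNNING,       false, true };
    toy_handle_prev(&a, false);   /* dequeued                              */
    toy_handle_prev(&b, false);   /* stays runnable, back to TASK_RUNNING  */
    toy_handle_prev(&c, false);   /* was preempted, stays on the queue     */
    return 0;
}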
The code of deactivate_task() is as follows:
/*
* deactivate_task - remove a task from the runqueue.
*/
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;//update the uninterruptible-task count
dequeue_task(rq, p, sleep);//call the dequeue method of the task's scheduling class to remove it
dec_nr_running(p, rq);//update the runnable-task count
}
The code of dequeue_task():
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
if (sleep && p->se.last_wakeup) {//relevant to recently woken tasks; we will come back to this when analyzing the wakeup path
update_avg(&p->se.avg_overlap,
p->se.sum_exec_runtime - p->se.last_wakeup);
p->se.last_wakeup = 0;
}
sched_info_dequeued(p);//conditionally compiled; no effect on scheduling behaviour
p->sched_class->dequeue_task(rq, p, sleep);//here the class-specific removal actually happens
p->se.on_rq = 0;//mark the task as no longer on the runqueue
}
For the CFS class, dequeue_task maps to dequeue_task_fair() (as the naming convention suggests):
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {//group-scheduling loop again; without CONFIG_FAIR_GROUP_SCHED it runs exactly once
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep);//here, finally, is the actual removal
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
sleep = 1;
}
hrtick_update(rq);
}
Going one level deeper, here is dequeue_entity():
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);//update the runtime statistics of the current entity
update_stats_dequeue(cfs_rq, se);//update some per-entity statistics, to be looked at in detail if we ever need them
if (sleep) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->block_start = rq_of(cfs_rq)->clock;
}
#endif
}
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);//mainly updates the load accounting
update_min_vruntime(cfs_rq);
}
clear_buddies() ties into other concepts and will be analyzed separately later. The key lines are these:
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
Hold on, isn't se just the prev passed in from above, i.e. the task currently running on this CPU? Why is it removed from the red-black tree only when it is *not* cfs_rq->curr? The reason is that CFS never keeps the running entity in the tree: when an entity is chosen to run, set_next_entity() (shown further below) already takes it out, so for the current entity there is nothing left to remove here; the check matters for the other callers of dequeue_entity() where se may be a queued, non-running entity. After the removal, the CPU load, the cfs queue load and the runnable count are updated, and the removed entity is marked as no longer on the runqueue:
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
se->on_rq = 0;
}
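The bookkeeping here is simple enough to mirror in a few lines: the queue keeps a load sum and a runnable count, and every enqueue/dequeue adjusts them and flips the on_rq marker. A toy sketch with invented names (no red-black tree, no groups):
#include <stdio.h>

/* Toy load accounting, loosely mirroring account_entity_enqueue/dequeue (illustrative). */
struct toy_cfs_rq {
    unsigned long load_weight;   /* sum of the weights of queued entities */
    unsigned long nr_running;    /* how many entities are queued          */
};

struct toy_se {
    unsigned long weight;
    int on_rq;
};

static void toy_account_enqueue(struct toy_cfs_rq *rq, struct toy_se *se)
{
    rq->load_weight += se->weight;
    rq->nr_running++;
    se->on_rq = 1;
}

static void toy_account_dequeue(struct toy_cfs_rq *rq, struct toy_se *se)
{
    rq->load_weight -= se->weight;
    rq->nr_running--;
    se->on_rq = 0;               /* the entity is no longer on the runqueue */
}

int main(void)
{
    struct toy_cfs_rq rq = { 0, 0 };
    struct toy_se a = { 1024, 0 }, b = { 2048, 0 };
    toy_account_enqueue(&rq, &a);
    toy_account_enqueue(&rq, &b);
    toy_account_dequeue(&rq, &a);
    printf("load=%lu nr_running=%lu a.on_rq=%d\n",
           rq.load_weight, rq.nr_running, a.on_rq);   /* load=2048 nr_running=1 */
    return 0;
}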
After this long detour, back to schedule(): the current task has been removed from the runqueue, a load-balance pass is done if the queue is now empty, and then the following statement runs:
prev->sched_class->put_prev_task(rq, prev);
Again, for the CFS class put_prev_task corresponds to put_prev_task_fair():
/*
* Account for a descheduled task:
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
}
Ignoring group scheduling as before, we get to put_prev_entity():
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
}
cfs_rq->curr = NULL;
}
By the earlier analysis, in this scenario prev->on_rq is already zero (dequeue_task() cleared it), so put_prev_entity() effectively just sets cfs_rq->curr to NULL. Back on the main path of schedule(): the current task has been moved out of the way, and the next step is to pick a new task to run:
prev->sched_class->put_prev_task(rq, prev);
next = pick_next_task(rq, prev);
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
if (likely(rq->nr_running == rq->cfs.nr_running)) {
p = fair_sched_class.pick_next_task(rq);
if (likely(p))
return p;
}
class = sched_class_highest;
for ( ; ; ) {
p = class->pick_next_task(rq);
if (p)
return p;
/*
* Will never be NULL as the idle class always
* returns a non-NULL p:
*/
class = class->next;
}
}
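The loop over scheduling classes is a simple priority chain: ask the highest-priority class first, fall through to the next one on NULL, and rely on the idle class at the end always returning something. A toy sketch of that pattern, with invented names and only three stub classes:
#include <stdio.h>
#include <stddef.h>

struct toy_task { const char *name; };

/* Each class tries to supply a task; returns NULL if it has nothing runnable. */
struct toy_class {
    const char *name;
    struct toy_task *(*pick_next)(void);
    const struct toy_class *next;      /* next lower-priority class */
};

static struct toy_task *rt_pick(void)   { return NULL; }  /* no RT task runnable */
static struct toy_task *fair_pick(void) { static struct toy_task t = { "bash" };    return &t; }
static struct toy_task *idle_pick(void) { static struct toy_task t = { "swapper" }; return &t; }

static const struct toy_class idle_class = { "idle", idle_pick, NULL };
static const struct toy_class fair_class = { "fair", fair_pick, &idle_class };
static const struct toy_class rt_class   = { "rt",   rt_pick,   &fair_class };

static struct toy_task *toy_pick_next_task(void)
{
    for (const struct toy_class *c = &rt_class; c; c = c->next) {
        struct toy_task *p = c->pick_next();
        if (p) {
            printf("picked '%s' from the %s class\n", p->name, c->name);
            return p;
        }
    }
    return NULL;  /* never reached: the idle class always returns a task */
}

int main(void)
{
    toy_pick_next_task();
    return 0;
}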
The comments in the code say it all. Let's look into the CFS class to see how the pick-up is done (you can probably guess the function name by now):
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
if (unlikely(!cfs_rq->nr_running))
return NULL;
do {
se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
hrtick_start_fair(rq, p);
return p;
}
Next, pick_next_entity():
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_next_entity(cfs_rq);
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)//the next/last "buddy" handling is analyzed later
return cfs_rq->next;
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
return cfs_rq->last;
return se;
}
And there is yet another level below:
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
if (!left)
return NULL;
return rb_entry(left, struct sched_entity, run_node);
}
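The red-black tree exists to answer one question quickly: which queued entity has the smallest vruntime? The toy sketch below uses a plain array scan instead of an rbtree to show the same idea, including the detail that the entity chosen to run leaves the "tree" (set_next_entity) and is re-inserted when it is put back (put_prev_entity). All names are invented for illustration:
#include <stdio.h>

struct toy_se {
    const char *name;
    unsigned long long vruntime;
    int in_tree;                 /* 1 = currently queued in the "tree" */
};

/*
 * Stand-in for __pick_next_entity(): find the smallest vruntime among the
 * queued entities. The kernel keeps them in a red-black tree and caches the
 * leftmost node so the lookup is cheap; a linear scan shows the same idea.
 */
static struct toy_se *toy_pick_leftmost(struct toy_se *se, int n)
{
    struct toy_se *best = NULL;
    for (int i = 0; i < n; i++) {
        if (!se[i].in_tree)
            continue;
        if (!best || se[i].vruntime < best->vruntime)
            best = &se[i];
    }
    return best;
}

int main(void)
{
    struct toy_se tasks[] = { { "A", 1000, 1 }, { "B", 400, 1 }, { "C", 700, 1 } };
    struct toy_se *next = toy_pick_leftmost(tasks, 3);
    printf("picked %s (vruntime=%llu)\n", next->name, next->vruntime);   /* picks B */

    next->in_tree = 0;      /* set_next_entity(): the running entity leaves the tree */
    next->vruntime += 500;  /* it accumulates runtime while it runs                  */
    next->in_tree = 1;      /* put_prev_entity(): still runnable, back into the tree */

    printf("next pick: %s\n", toy_pick_leftmost(tasks, 3)->name);        /* now C    */
    return 0;
}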
So it does return the leftmost node of the red-black tree. Once the leftmost entity has been chosen, set_next_entity() runs:
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->slice_max = max(se->slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
So set_next_entity() takes the chosen entity out of the tree and records its prev_sum_exec_runtime (remember, in the tick handling the task's runtime for the current slice is computed as sum_exec_runtime minus this prev_sum_exec_runtime, and that is what decides whether it must be switched out). With that, the pick-up work is done and the old task has already been removed from the queue; what remains is the hand-over that lets the new task actually run, and since that has nothing to do with the scheduler's policy we skip it for now.
We have analyzed process creation and a process voluntarily giving up the CPU, which leaves only one scenario: being woken up. Ignoring SMP load balancing, a wakeup really does just one thing: put the task back on the runqueue, update the runnable count and the load, set its state to TASK_RUNNING, and check whether it should preempt the current task, setting the resched flag if so. That is all, so only the relevant fragment is shown (a toy sketch of the flow follows it):
update_rq_clock(rq);
activate_task(rq, p, 1);
success = 1;
out_running:
trace_sched_wakeup(rq, p);
check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
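Here is a toy condensation of that wakeup flow into a single function; the names are invented, the preemption test is reduced to a bare vruntime comparison (the real check_preempt path also applies a wakeup-granularity threshold), and none of try_to_wake_up()'s SMP or memory-ordering concerns appear:
#include <stdio.h>
#include <stdbool.h>

enum toy_state { TOY_RUNNING, TOY_SLEEPING };

struct toy_task {
    const char *name;
    enum toy_state state;
    bool on_rq;
    unsigned long long vruntime;
};

struct toy_rq {
    unsigned long nr_running;
    struct toy_task *curr;
};

/* Toy wake-up: enqueue, mark runnable, then see if the woken task should
 * preempt whatever is currently running (here simply "smaller vruntime wins"). */
static void toy_wake_up(struct toy_rq *rq, struct toy_task *p)
{
    if (p->on_rq)
        return;                               /* already runnable, nothing to do    */
    p->on_rq = true;                          /* activate_task(): back on the queue */
    rq->nr_running++;
    p->state = TOY_RUNNING;
    if (p->vruntime < rq->curr->vruntime)     /* check_preempt_curr(), simplified   */
        printf("%s preempts %s: set the resched flag\n", p->name, rq->curr->name);
    else
        printf("%s enqueued behind %s\n", p->name, rq->curr->name);
}

int main(void)
{
    struct toy_task curr  = { "cpu-hog", TOY_RUNNING,  true,  9000 };
    struct toy_task woken = { "editor",  TOY_SLEEPING, false, 2000 };
    struct toy_rq rq = { 1, &curr };
    toy_wake_up(&rq, &woken);
    return 0;
}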