Process Scheduling in Linux (Part 5)

Date: 2021-01-07 14:36:42
  The previous posts mainly looked at what the kernel does, scheduling-wise, when a new process is created. Now let's see what happens on a timer tick. When the timer interrupt fires, the interrupt handling path eventually calls scheduler_tick():
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();//get the id of the current cpu
	struct rq *rq = cpu_rq(cpu);//get the runqueue owned by that cpu; both of these are per-cpu variables (a dedicated post on per-cpu variables will come later)
	struct task_struct *curr = rq->curr;//get the currently running process

	sched_clock_tick();//an empty stub unless CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is defined, in which case it resynchronizes the per-cpu sched clock; not essential to scheduling policy, so not analyzed here

	spin_lock(&rq->lock);//take the runqueue lock: the system may have more than one cpu, and an idle cpu may "pull" processes out of this cpu's runqueue, so the lock prevents races
	update_rq_clock(rq);//update the runqueue clock; the implementation is very low-level, roughly reading a cycle counter out of a cpu register
	update_cpu_load(rq);//update the cpu load statistics; the implementation looks more empirical than rigorous, so we will come back to it only if it matters later
	curr->sched_class->task_tick(rq, curr, 0);//invoke the tick handler of the sched_class that curr belongs to
	spin_unlock(&rq->lock);//release the lock

#ifdef CONFIG_SMP//related to load balancing, to be analyzed in a later post dedicated to it
	rq->idle_at_tick = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}
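A quick aside on the per-cpu variables mentioned in the comments above: in this era of the kernel, the per-cpu runqueues and the cpu_rq() accessor used in scheduler_tick() are defined in kernel/sched.c roughly as follows (quoted from memory, so take it as a sketch rather than verbatim source):
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
Each cpu thus has its own struct rq instance, and smp_processor_id() plus cpu_rq() simply select the local one.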
For a process belonging to the CFS scheduling class, the task_tick handler is task_tick_fair():
/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {//related to group scheduling; without CONFIG_FAIR_GROUP_SCHED this loop executes exactly once
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);//the real work is done in this function
	}
}
 
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);//update the current process's virtual runtime, analyzed in an earlier post

#ifdef CONFIG_SCHED_HRTICK//skipped here
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))//check whether the current process should be preempted
		check_preempt_tick(cfs_rq, curr);
}
Now for the key part:
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);//compute the wall-clock time the current process is entitled to run
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime)//if it has already run past its allocation, set the resched flag; the reschedule itself happens when the tick handler returns towards user mode
		resched_task(rq_of(cfs_rq)->curr);
}
Timer interrupts fire all the time, so couldn't we simply store each process's entitled runtime in its task structure? Wouldn't that save time? Yes, and older kernels did exactly that, but it has a drawback: a process's runnable time is then fixed at creation (the O(1) scheduler could heuristically adjust timeslices based on process behaviour, but that never really solved the problem). Recomputing the value each time distributes time dynamically according to how many processes are in the queue, which is more reasonable: as the number of runnable processes grows, each process's slice has to shrink, and once it would fall below the minimum granularity the period itself is stretched. The calculation is simple, so only the code is given, without comments. One question, though: why is wall-clock time used here? Wouldn't virtual time do?
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long nr_running = cfs_rq->nr_running;

	if (unlikely(!se->on_rq))
		nr_running++;

	return calc_delta_weight(__sched_period(nr_running), se);
}
/*
 * delta *= P[w / rw]
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		delta = calc_delta_mine(delta,
				se->load.weight, &cfs_rq_of(se)->load);
	}

	return delta;
}
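To get a feel for how the dynamic slice behaves, here is a minimal user-space sketch of the period and slice calculation (this is not kernel code; the tunables are assumed defaults for this kernel era: 20ms latency, 4ms minimum granularity, nr_latency = 5, nice-0 weight 1024):
#include <stdio.h>

/* Assumed default tunables of this kernel era, in nanoseconds. */
#define SCHED_LATENCY_NS     20000000ULL /* sysctl_sched_latency         */
#define MIN_GRANULARITY_NS    4000000ULL /* sysctl_sched_min_granularity */
#define NR_LATENCY                   5UL /* sched_nr_latency             */
#define NICE0_WEIGHT              1024UL

/* Period: the window in which every runnable task should run once.
 * It is stretched once more tasks are queued than fit at minimum granularity. */
static unsigned long long sched_period(unsigned long nr_running)
{
	if (nr_running > NR_LATENCY)
		return (unsigned long long)nr_running * MIN_GRANULARITY_NS;
	return SCHED_LATENCY_NS;
}

/* Wall-time slice, s = p * w / rw: the task's weight share of the period. */
static unsigned long long sched_slice(unsigned long long period,
				      unsigned long weight,
				      unsigned long total_weight)
{
	return period * weight / total_weight;
}

int main(void)
{
	/* All tasks at nice 0: the slice shrinks with the queue length until it
	 * reaches the minimum granularity, after which the period grows instead. */
	for (unsigned long n = 1; n <= 10; n++) {
		unsigned long long p = sched_period(n);
		unsigned long long s = sched_slice(p, NICE0_WEIGHT, NICE0_WEIGHT * n);
		printf("nr_running=%2lu  period=%3llu ms  slice=%llu ms\n",
		       n, p / 1000000, s / 1000000);
	}
	return 0;
}
With a single task the slice is the whole 20ms latency window; by five tasks it has shrunk to the 4ms granularity, and beyond that the period itself is stretched so the slice never falls below that floor. This is the ideal_runtime that check_preempt_tick() compares delta_exec against.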
So the tick-handling path is actually quite simple: it mainly updates the current process's runtime accounting, compares it against the ideal value, and sets the resched flag if the process has overrun, so that a reschedule happens on the way back from kernel mode to user mode. Now let's look at what happens when a process gives up the cpu voluntarily, i.e. calls schedule() explicitly.
/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_qsctr_inc(cpu);//increment this cpu's RCU quiescent-state counter: a context switch is a quiescent point, and RCU uses these per-cpu counters to detect grace periods
	prev = rq->curr;
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	spin_lock_irq(&rq->lock);
	update_rq_clock(rq);
	clear_tsk_need_resched(prev);

	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))//if a signal is pending for the current process, give it another chance: keep it runnable instead of taking it off the runqueue
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, 1);//remove the process from the runqueue
		switch_count = &prev->nvcsw;
	}

#ifdef CONFIG_SMP
	if (prev->sched_class->pre_schedule)//the CFS scheduling class does not provide this hook
		prev->sched_class->pre_schedule(rq, prev);
#endif

	if (unlikely(!rq->nr_running))//if the queue is now empty, do a balancing pass to pull work from other cpus
		idle_balance(cpu, rq);

	prev->sched_class->put_prev_task(rq, prev);
	next = pick_next_task(rq, prev);

	if (likely(prev != next)) {
		sched_info_switch(prev, next);

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * the context switch might have flipped the stack from under
		 * us, hence refresh the local variables.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		spin_unlock_irq(&rq->lock);

	if (unlikely(reacquire_kernel_lock(current) < 0))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
The code of deactivate_task:
 /*
  * deactivate_task - remove a task from the runqueue.
  */
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 {
     if (p->state == TASK_UNINTERRUPTIBLE)
         rq->nr_uninterruptible++;//update the statistics

     dequeue_task(rq, p, sleep);//call the matching function of the process's scheduling class to remove it
     dec_nr_running(p, rq);//update the count of runnable processes
 }
The code of dequeue_task:
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
     if (sleep && p->se.last_wakeup) {//we will come back to this when analyzing the wakeup path
         update_avg(&p->se.avg_overlap,
                p->se.sum_exec_runtime - p->se.last_wakeup);
         p->se.last_wakeup = 0;
     }

     sched_info_dequeued(p);//conditionally compiled; has no effect on scheduling behaviour
     p->sched_class->dequeue_task(rq, p, sleep);//here the real removal happens
     p->se.on_rq = 0;//mark the process as no longer on the runqueue
 }
For the CFS scheduling class, dequeue_task corresponds to dequeue_task_fair (a name you could have guessed from the naming convention :)
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {//ignoring group scheduling (i.e. without CONFIG_FAIR_GROUP_SCHED), this loop executes only once
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, sleep);//here, at last, is the actual removal
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
			break;
		sleep = 1;
	}

	hrtick_update(rq);
}
Going one level deeper, here is the code of dequeue_entity:
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);//update the runtime accounting

	update_stats_dequeue(cfs_rq, se);//update some per-entity statistics; we will look at them closely only if they matter later
	if (sleep) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);//主要更新负载信息
	update_min_vruntime(cfs_rq);
}
clear_buddies ties into other concepts and will be analyzed separately later. The key lines to focus on are these:
if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
Wait, isn't se just the prev that was passed down, i.e. the process currently running on this cpu? So it is only taken out of the red-black tree if it is not cfs_rq->curr? The explanation is that CFS never keeps the currently running entity in the tree: it is removed when it is picked (see set_next_entity below), so if se == cfs_rq->curr there is nothing left to remove and only the accounting needs updating. After the removal, the cpu load, the cfs queue's load and the count of runnable processes are updated, and the removed process is marked as no longer on the runqueue:
  static void
  account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
      update_load_sub(&cfs_rq->load, se->load.weight);
      if (!parent_entity(se))
          dec_cpu_load(rq_of(cfs_rq), se->load.weight);
      if (entity_is_task(se)) {
          add_cfs_task_weight(cfs_rq, -se->load.weight);
          list_del_init(&se->group_node);
      }
      cfs_rq->nr_running--;
      se->on_rq = 0;
  }
After that long detour we come back to schedule(): once the current process has been moved off the runqueue (and a load-balancing pass has been done if the queue became empty), the following statement executes:
prev->sched_class->put_prev_task(rq, prev);
Again, for the CFS scheduling class put_prev_task corresponds to put_prev_task_fair:
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}
Again ignoring group scheduling, we go into put_prev_entity:
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
	cfs_rq->curr = NULL;
}
Following the earlier analysis, in this scenario prev->on_rq is already zero, so this function effectively just sets cfs_rq->curr to NULL. Back on the main path of schedule(): the current process has been moved away, and a new process now has to be picked to run:
  prev->sched_class->put_prev_task(rq, prev);
  next = pick_next_task(rq, prev);
 
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
	}

	class = sched_class_highest;
	for ( ; ; ) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
		/*
		 * Will never be NULL as the idle class always
		 * returns a non-NULL p:
		 */
		class = class->next;
	}
}
The comments in the code speak for themselves. Let's go into the CFS class's code to see how the pick is actually done (you can probably already guess the function name :)
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (unlikely(!cfs_rq->nr_running))
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	hrtick_start_fair(rq, p);

	return p;
}
Now pick_next_entity:
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_next_entity(cfs_rq);

	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)//analyzed further below
		return cfs_rq->next;

	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
		return cfs_rq->last;

	return se;
}
And there is one more level down:
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}
So it does indeed return the leftmost node of the red-black tree, i.e. the entity with the smallest vruntime (cfs_rq->rb_leftmost is a cached pointer, so the lookup is O(1)). Once the leftmost node has been chosen, set_next_entity runs:
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. dont track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->slice_max = max(se->slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
So the picked entity is taken out of the tree and its prev_sum_exec_runtime is updated. (Remember: in the tick handler, the time the current process has run since being scheduled in is computed as sum_exec_runtime minus this prev_sum_exec_runtime, and that is what decides whether it should be switched out.) With that, the pick is done and the old process has been removed from the queue; what remains is the actual hand-over that lets the new process run, which is independent of the scheduler class, so we skip it for now.

We have now analyzed process creation and a process voluntarily giving up the cpu. That leaves one scenario: being woken up. In that case, if we ignore SMP load balancing, essentially only one thing happens: the woken process is added to the runqueue, the count of runnable processes and the load are updated, the task state is set to TASK_RUNNING, and then the kernel checks whether it may preempt the current process, setting the resched flag if so. That is all, quite simple, so only a fragment is shown:
	update_rq_clock(rq);
	activate_task(rq, p, 1);
	success = 1;

out_running:
	trace_sched_wakeup(rq, p);
	check_preempt_curr(rq, p, sync);

	p->state = TASK_RUNNING;
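For completeness, the check_preempt_curr() called above just dispatches to the current task's scheduling class; in this era it looks roughly like the sketch below (quoted from memory, so treat it as a sketch rather than the exact source), and for CFS the hook ends up in check_preempt_wakeup():
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}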