linux内核定时器

linux的定时器一般分为两种，一种是timeout类型，也就是在指定时间之前完成相应的任务即可，这种定时器对精度要求较低，晚几毫秒执行不会有很大的影响，而且一般这种类型的定时器要处理的任务在超时之前就已经完成，并且从定时器的队列中删除了，用不着真正的等到timeout然后由定时器模块来处理，这种较低精度要求的定时器一般使用timer wheel定时器。另一种类型的就是timer类型定时器，这就要求必须在指定的时间执行相应的任务，因此精度要求较高，这种场合一般适用高精度的定时器hrtimer。

timer wheel和hrtimer使用两种不同的机制实现定时器。timer wheel使用jiffies为基准来判断任务是否过期：jiffies记录系统的节拍数，系统每次时钟中断都会将这个值加1，而系统每秒的时钟中断次数为HZ（宏定义的一个常量，一般为100，此时每个节拍为10毫秒），因此jiffies为timer wheel定时器提供了毫秒级的精度。而hrtimer需要高精度的时钟设备，为系统提供纳秒级的定时器。这两者都通过软中断来驱动，timer wheel定时器通过软中断TIMER_SOFTIRQ，而hrtimer通过HRTIMER_SOFTIRQ来响应定时器。

1. timer wheel定时器

timer wheel定时器的请求通过struct timer_list来抽象，然后按照定时器的过期时间和基准时间的差值将其组织在双链表中，且相同过期时间的定时器放在同一个链表中，当响应软中断时，则将过期时间在当前时间之前的定时器全部删除，并且执行相应的回调函数。

/*
 * One timer-wheel request: when it expires, which callback to run, and the
 * list linkage used to queue it on a bucket of a tvec_base.
 */
struct timer_list {
	/*
	 * All fields that change during normal runtime grouped to the
	 * same cacheline
	 */
	struct list_head entry; /* node in a bucket's doubly linked list */
	unsigned long expires; /* expiry time, compared against base->timer_jiffies */
	/*
	 * The tvec_base this timer is queued on. The base supplies the
	 * reference time (timer_jiffies), which is not updated in lock-step
	 * with jiffies, and owns every timer_list queued on it.
	 * struct tvec_base is declared ____cacheline_aligned, so the low bit
	 * of a valid pointer is 0 and can carry extra information.
	 * NOTE(review): the article states that bit 0 == 1 marks the timer as
	 * deferrable (its execution may be delayed) - confirm against the
	 * tbase_* helpers, which are not shown here.
	 */
	struct tvec_base *base;

	void (*function)(unsigned long); /* expiry callback */
	unsigned long data; /* argument handed to the callback */

	int slack;

#ifdef CONFIG_TIMER_STATS /* timer statistics */
	int start_pid;
	void *start_site;
	char start_comm[16];
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map lockdep_map;
#endif
};

#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)#define TVN_SIZE (1 << TVN_BITS)#define TVR_SIZE (1 << TVR_BITS)#define TVN_MASK (TVN_SIZE - 1)#define TVR_MASK (TVR_SIZE - 1)struct tvec {	struct list_head vec[TVN_SIZE];};struct tvec_root {	struct list_head vec[TVR_SIZE];}; //定时器双链表的表头struct tvec_base {	spinlock_t lock;	struct timer_list *running_timer; //正在执行的timer_list	unsigned long timer_jiffies; //上文说的基准时间	unsigned long next_timer; //距离timer_jiffies最近的过期时间	struct tvec_root tv1; //tv1--tv5根据过期时间的大小将timer_list放入其中，tv1表示过期时间最短的任务	struct tvec tv2;	struct tvec tv3;	struct tvec tv4;	struct tvec tv5;} ____cacheline_aligned;

上面的几个结构的关系可以用下图表示出来：

struct timer_list对象放在tv1--tv5中哪个struct tvec上，是通过timer_list.expires - tvec_base.timer_jiffies来确定的，也就是说，是通过过期时间和基准时间之间的差值来确定struct timer_list对象在哪个tvX上的。

若上面的差值可以在TVR_BITS位内表示出来，则将相应的timer_list放在tv1上，并以过期时间expires的最低TVR_BITS位的值作为其在tv1数组上的索引，将其串到对应的双链表上；若差值可以在TVR_BITS + TVN_BITS位内表示，则将其放在tv2上，以expires中紧接着的TVN_BITS位段的值作为tv2数组内部的索引，然后依次类推；若差值不小于1 << (TVR_BITS + 3 * TVN_BITS)，则将其全部放在tv5内，表示过期时间还很长，一段时间内轮不到其执行，此时以expires中再往上的TVN_BITS位段作为tv5内的索引。

插入定时器的操作可以清晰的看到上面的过程：

/*
 * Queue @timer on the bucket of @base that matches its expiry time.
 *
 * The wheel level (tv1..tv5) is chosen from the distance between
 * timer->expires and base->timer_jiffies; the bucket inside that level is
 * chosen from the corresponding bit field of the absolute expiry time.
 * The caller is expected to hold whatever protects @base - no locking is
 * done here.
 */
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
	unsigned long when = timer->expires;
	unsigned long delta = when - base->timer_jiffies;
	struct list_head *bucket;

	if (delta < TVR_SIZE) {
		/* Distance fits in the tv1 bit field. */
		bucket = &base->tv1.vec[when & TVR_MASK];
	} else if (delta < (1 << (TVR_BITS + TVN_BITS))) {
		/* Distance fits in the tv2 bit field. */
		bucket = &base->tv2.vec[(when >> TVR_BITS) & TVN_MASK];
	} else if (delta < (1 << (TVR_BITS + 2 * TVN_BITS))) {
		bucket = &base->tv3.vec[(when >> (TVR_BITS + TVN_BITS)) & TVN_MASK];
	} else if (delta < (1 << (TVR_BITS + 3 * TVN_BITS))) {
		bucket = &base->tv4.vec[(when >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK];
	} else if ((signed long) delta < 0) {
		/*
		 * Expiry is not after the base time (e.g. expires == jiffies,
		 * or a timer set to go off in the past): use the tv1 bucket
		 * of the current base time so it is picked up right away.
		 */
		bucket = &base->tv1.vec[base->timer_jiffies & TVR_MASK];
	} else {
		/*
		 * On 64-bit architectures a distance above 0xffffffff is
		 * clamped to that maximum timeout.
		 */
		if (delta > 0xffffffffUL) {
			delta = 0xffffffffUL;
			when = delta + base->timer_jiffies;
		}
		bucket = &base->tv5.vec[(when >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK];
	}

	/* Append at the tail so timers of one bucket stay in FIFO order. */
	list_add_tail(&timer->entry, bucket);
}

定时器的执行则比较当前时间和过期时间，直到tvec_base里面没有过期的定时器对象为止。但是linux并非在tvec_base遍历所有的timer_list对象，然后将过期的删除，而是通过增加tvec_base.timer_jiffies基准时间，然后执行tvec_base上相对于基准时间过期的定时器。由于timer_list是根据过期时间和基准时间的差值放在tvec_base上的，因此每当基准时间的最低TVR_BITS位归零时，需要将高层tvX中对应链表上的定时器重新放入新的tvX。由于相同的过期时间放在同一个链表中，因此只要对整个链表的表头操作即可，这里的链表迁移操作由cascade完成：

static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
	/* cascade all the timers from tv up one level */
	struct timer_list *timer, *tmp;
	struct list_head tv_list;

	list_replace_init(tv->vec + index, &tv_list); //将链表头放到tv_list中

	/*
	 * We are removing _all_ timers from the list, so we
	 * don't have to detach them individually.
	 */
	list_for_each_entry_safe(timer, tmp, &tv_list, entry) { //将所有的链表元素重新加到base中
		BUG_ON(tbase_get_base(timer->base) != base);
		internal_add_timer(base, timer);
	}

	return index;
}

/*
 * Bucket index for the current base time in an upper wheel level: the
 * TVN_BITS-wide field of base->timer_jiffies located TVR_BITS + N * TVN_BITS
 * bits up (INDEX(0) is used with tv2, INDEX(1) with tv3, and so on).
 * internal_add_timer() picks a timer's bucket from the same bit field of its
 * expiry time, so this is the bucket whose timers are now due to be cascaded.
 * Expects a variable named "base" to be in scope at the point of use.
 */
#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

/*
 * Run every timer queued on @base that is due at the current jiffies.
 *
 * base->timer_jiffies is advanced one tick at a time until it has passed
 * jiffies. For each tick the matching tv1 bucket is detached, and every
 * timer on it is removed from the base and has its callback invoked with
 * base->lock released.
 */
static inline void __run_timers(struct tvec_base *base)
{
	struct timer_list *timer;

	spin_lock_irq(&base->lock);
	/* Loop while the base time has not yet moved past the current time. */
	while (time_after_eq(jiffies, base->timer_jiffies)) {
		struct list_head work_list;
		struct list_head *head = &work_list;
		/*
		 * tv1 bucket for this tick. An index of 0 means the low
		 * TVR_BITS bits of the base time have wrapped, so tv1 must be
		 * refilled from the upper levels.
		 */
		int index = base->timer_jiffies & TVR_MASK;

		/*
		 * Cascade timers:
		 *
		 * cascade() returns the index it was given, so each further
		 * level is cascaded only when the previous level's index was
		 * 0 as well.
		 */
		if (!index &&
			(!cascade(base, &base->tv2, INDEX(0))) &&
				(!cascade(base, &base->tv3, INDEX(1))) &&
					!cascade(base, &base->tv4, INDEX(2)))
			cascade(base, &base->tv5, INDEX(3));
		/* Advance the base time before running this tick's bucket. */
		++base->timer_jiffies;
		list_replace_init(base->tv1.vec + index, &work_list);
		while (!list_empty(head)) {
			void (*fn)(unsigned long);
			unsigned long data;

			/* Take the first timer and copy out its callback and argument. */
			timer = list_first_entry(head, struct timer_list,entry);
			fn = timer->function;
			data = timer->data;

			timer_stats_account_timer(timer);

			/* Record it as the running timer, then unlink it from the list. */
			base->running_timer = timer;
			detach_timer(timer, 1);

			/* The callback runs without base->lock held. */
			spin_unlock_irq(&base->lock);
			call_timer_fn(timer, fn, data);
			spin_lock_irq(&base->lock);
		}
	}
	base->running_timer = NULL;
	spin_unlock_irq(&base->lock);
}

响应中断的函数会调用这个函数执行真正的定时器过期检查：

/*
 * Softirq entry point for the timer wheel.
 *
 * Always calls hrtimer_run_pending() first (per the article, this is where
 * a switch to high-resolution mode is attempted while it is not yet
 * active), then runs this CPU's timer wheel if its base time has not yet
 * passed the current jiffies.
 */
static void run_timer_softirq(struct softirq_action *h)
{
	struct tvec_base *base = __this_cpu_read(tvec_bases);

	hrtimer_run_pending();

	/* Base time already ahead of jiffies: nothing can be due. */
	if (!time_after_eq(jiffies, base->timer_jiffies))
		return;

	__run_timers(base);
}

timer wheel定时器的过程大概如此，它满足了系统对低精度定时或超时的要求。

2. hrtimer定时器

hrtimer定时器提供不同类型的定时器请求，包括单调时间基类型、实时时间基类型、相对于boot的时间基类型。每一个定时器请求使用struct hrtimer表示，然后将请求对象放到请求队列中；同CFS一样，请求队列使用红黑树实现。

/**
 * struct hrtimer - the basic hrtimer structure
 * @node:	timerqueue node, which also manages node.expires,
 *		the absolute expiry time in the hrtimers internal
 *		representation. The time is related to the clock on
 *		which the timer is based. Is setup by adding
 *		slack to the _softexpires value. For non range timers
 *		identical to _softexpires.
 * @_softexpires: the absolute earliest expiry time of the hrtimer.
 *		The time which was given as expiry time when the timer
 *		was armed.
 * @function:	timer expiry callback function
 * @base:	pointer to the timer base (per cpu and per clock)
 * @state:	state information (See bit values above)
 * @start_site:	timer statistics field to store the site where the timer
 *		was started
 * @start_comm: timer statistics field to store the name of the process which
 *		started the timer
 * @start_pid: timer statistics field to store the pid of the task which
 *		started the timer
 *
 * The hrtimer structure must be initialized by hrtimer_init()
 */
struct hrtimer {
	struct timerqueue_node		node; /* node in the clock base's timer queue */
	ktime_t				_softexpires; /* earliest (soft) expiry time */
	enum hrtimer_restart		(*function)(struct hrtimer *); /* expiry callback */
	struct hrtimer_clock_base	*base; /* clock base this timer belongs to */
	unsigned long			state; /* state bits; the bit values are defined outside this excerpt */
#ifdef CONFIG_TIMER_STATS
	int				start_pid;
	void				*start_site;
	char				start_comm[16];
#endif
};

同timer wheel的定时器，通过一个基准时间来控制定时器对象的过期，hrtimer定时器的基准时间使用hrtimer_clock_base,这个基准时间实例也用来将hrtimer对象组织在active成员所表示的红黑树根节点上。

/*
 * Indices of the clock bases kept per CPU, one per kind of reference clock.
 * HRTIMER_MAX_CLOCK_BASES is not a clock: it is the number of bases, used
 * as the size of clock_base[] and as the bound of loops over it.
 */
enum  hrtimer_base_type {
	HRTIMER_BASE_MONOTONIC,
	HRTIMER_BASE_REALTIME,
	HRTIMER_BASE_BOOTTIME,
	HRTIMER_MAX_CLOCK_BASES,
};

/*
 * One clock base of a CPU: the queue of pending hrtimers together with the
 * clock their expiry times are measured against.
 */
struct hrtimer_clock_base {
	struct hrtimer_cpu_base	*cpu_base; /* owning per-CPU hrtimer_cpu_base */
	int			index; /* per the article: position of this base in cpu_base->clock_base[] */
	clockid_t		clockid; /* clock id of this base (not referenced in the code shown) */
	struct timerqueue_head	active; /* queue of pending timers; timerqueue_getnext() yields the earliest */
	ktime_t			resolution; /* timer resolution; set to KTIME_HIGH_RES on the switch to high-res mode */
	ktime_t			(*get_time)(void); /* returns the current time of this base's clock */
	ktime_t			softirq_time; /* NOTE(review): the article calls this the "base time"; not used in the code shown - confirm */
	/*
	 * Offset between this base's clock and the time read via ktime_get():
	 * hrtimer_interrupt() adds it to "now" to get this base's current time,
	 * and hrtimer_reprogram() subtracts it from a timer's expiry before
	 * programming the clock event device.
	 */
	ktime_t			offset;
};

struct hrtimer_cpu_base {	raw_spinlock_t			lock;	unsigned long			active_bases; //一个flag值，表示哪种类型clock_base是可以使用#ifdef CONFIG_HIGH_RES_TIMERS	ktime_t				expires_next; //下一个timer event来的绝对时间，就是下一次的过期时间	int				hres_active; //高精度定时器是否有效	int				hang_detected; //最后一次响应中断时是否被挂起	unsigned long			nr_events; //timer event的个数	unsigned long			nr_retries; //在响应hrtimer软中断时失败重试的次数	unsigned long			nr_hangs; //总的挂起的次数	ktime_t				max_hang_time; //挂起时，中断响应最大运行时间#endif	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES]; //不同类型的基准时钟对象};

由于hrtimer定时器使用专门的时钟设备，因此判断时间是否过期需要从时钟设备上来确认，因此，需要与时钟设备交互，hrtimer通过hrtimer_reprogram来判断一个时间是否是过去的时间：

/*
 * Reprogram the clock event device for @timer if it expires earlier than
 * the event currently programmed on this CPU.
 *
 * @timer: the timer that may need an earlier event
 * @base:  the clock base of @timer; its offset is subtracted from the expiry
 *
 * Returns 0 when nothing had to be programmed (callback running, expiry not
 * earlier than the next programmed one, or a hang was detected), -ETIME when
 * the offset-corrected expiry is negative, otherwise the result of
 * tick_program_event() (which reports -ETIME for an expiry in the past).
 */
static int hrtimer_reprogram(struct hrtimer *timer,
			     struct hrtimer_clock_base *base)
{
	/* hrtimer state of the CPU we are running on */
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	/* expiry with the clock base's offset removed */
	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
	int res;

	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

	/*
	 * When the callback is running, we do not reprogram the clock event
	 * device. The timer callback is either running on a different CPU or
	 * the callback is executed in the hrtimer_interrupt context. The
	 * reprogramming is handled either by the softirq, which called the
	 * callback or at the end of the hrtimer_interrupt.
	 */
	if (hrtimer_callback_running(timer))
		return 0;

	/*
	 * CLOCK_REALTIME timer might be requested with an absolute
	 * expiry time which is less than base->offset. Nothing wrong
	 * about that, just avoid to call into the tick code, which
	 * has now objections against negative expiry values.
	 */
	if (expires.tv64 < 0)
		return -ETIME;

	/* Not earlier than the event already programmed: nothing to do. */
	if (expires.tv64 >= cpu_base->expires_next.tv64)
		return 0;

	/*
	 * If a hang was detected in the last timer interrupt then we
	 * do not schedule a timer which is earlier than the expiry
	 * which we enforced in the hang detection. We want the system
	 * to make progress.
	 */
	if (cpu_base->hang_detected) 
		return 0;

	/*
	 * Clockevents returns -ETIME, when the event was in the past.
	 * On success the device is programmed for @expires, which becomes
	 * the new next-event time of this CPU.
	 */
	res = tick_program_event(expires, 0);
	if (!IS_ERR_VALUE(res))
		cpu_base->expires_next = expires;
	return res;
}

若没有激活高精度定时器，在timer wheel中可以看到，每次tick都会尝试切换到高精度定时器模式，切换高精度定时就需要对时钟设备重编程：

/*
 * Switch the current CPU's hrtimer base to high-resolution mode.
 *
 * Returns 1 when high-resolution mode is active on return (including the
 * case where it already was), 0 when tick_init_highres() failed.
 */
static int hrtimer_switch_to_hres(void)
{
	int i, cpu = smp_processor_id();
	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
	unsigned long flags;

	if (base->hres_active)
		return 1;

	local_irq_save(flags);

	/*
	 * A non-zero result means the switch failed.
	 * NOTE(review): per the article, this installs hrtimer_interrupt as
	 * the clock event handler - tick_init_highres() is not shown here.
	 */
	if (tick_init_highres()) {
		local_irq_restore(flags);
		printk(KERN_WARNING "Could not switch to high resolution "
				    "mode on CPU %d\n", cpu);
		return 0;
	}
	/* Mark high-resolution mode as active and raise every base's resolution. */
	base->hres_active = 1;
	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
		base->clock_base[i].resolution = KTIME_HIGH_RES;

	tick_setup_sched_timer();

	/*
	 * "Retrigger" the interrupt to get things going.
	 * NOTE(review): per the article, this refreshes each clock base's
	 * offset and reprograms the device with a new expires_next - the
	 * helper is not shown here.
	 */
	retrigger_next_event(NULL);
	local_irq_restore(flags);
	return 1;
}

与时钟设备的交互基本都基于hrtimer_reprogram函数就可以了。

向系统请求一个高精度定时器就是生成一个hrtimer对象，设置回调函数并且设置过期时间，然后将其加入到hrtimer对象的红黑树交给系统管理，若加入的对象是过期时间最早的一个对象，就对时钟设备重编程，大概流程应该就是如此。高精度定时器将所有增加定时器的请求最后都交给__hrtimer_start_range_ns处理：

/*
 * (Re)start @timer so that it expires at @tim, with @delta_ns of range.
 *
 * @timer:    the timer to queue
 * @tim:      expiry time; relative to the base's current time when @mode
 *            has HRTIMER_MODE_REL set, absolute otherwise
 * @delta_ns: range passed on to hrtimer_set_expires_range_ns()
 * @mode:     HRTIMER_MODE_* flags (REL and PINNED are examined here)
 * @wakeup:   passed through to hrtimer_enqueue_reprogram()
 *
 * Returns the result of remove_hrtimer(), i.e. of taking the timer off the
 * queue before it is re-queued.
 */
int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
		unsigned long delta_ns, const enum hrtimer_mode mode,
		int wakeup)
{
	struct hrtimer_clock_base *base, *new_base;
	unsigned long flags;
	int ret, leftmost;

	base = lock_hrtimer_base(timer, &flags); 

	/* Remove an active timer from the queue: */
	ret = remove_hrtimer(timer, base);

	/*
	 * Switch the timer base, if necessary.
	 * NOTE(review): per the article, this prefers moving the timer to
	 * the current CPU's base - the helper is not shown here.
	 */
	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

	/* A relative expiry is turned into an absolute one on the new base's clock. */
	if (mode & HRTIMER_MODE_REL) {
		tim = ktime_add_safe(tim, new_base->get_time());
		/*
		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
		 * to signal that they simply return xtime in
		 * do_gettimeoffset(). In this case we want to round up by
		 * resolution when starting a relative timer, to avoid short
		 * timeouts. This will go away with the GTOD framework.
		 */
#ifdef CONFIG_TIME_LOW_RES
		tim = ktime_add_safe(tim, base->resolution);
#endif
	}

	/* Store the expiry time (and range) in the timer. */
	hrtimer_set_expires_range_ns(timer, tim, delta_ns);

	timer_stats_hrtimer_set_start_info(timer);

	/*
	 * Queue the timer on the new base. The result is used below as
	 * "this timer is now the earliest one on the queue" (per the article:
	 * 1 when it became the leftmost node, 0 otherwise).
	 */
	leftmost = enqueue_hrtimer(timer, new_base);

	/*
	 * Only allow reprogramming if the new base is on this CPU.
	 * (it might still be on another CPU if the timer was pending)
	 *
	 * XXX send_remote_softirq() ?
	 */
	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
		hrtimer_enqueue_reprogram(timer, new_base, wakeup);

	unlock_hrtimer_base(timer, &flags);

	return ret;
}

定时器实例加入到系统之后就等待执行，等待定时时间到期。高精度模式下，时钟设备的事件由hrtimer_interrupt函数响应，它对所有到期的定时器执行回调函数并且将其从红黑树上删除：

/*
 * Clock event handler of high-resolution mode.
 *
 * Runs every expired hrtimer on all active clock bases of this CPU, works
 * out the earliest remaining expiry and programs the clock event device for
 * it. If that expiry is already in the past the whole pass is repeated; once
 * three passes have been made, a hang is recorded and the next event is
 * pushed out by the time spent in here, capped at 100ms.
 *
 * @dev: the clock event device that raised the event
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	int i, retries = 0;

	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event.tv64 = KTIME_MAX;

	/* Current time; entry_time is kept to measure how long we stay in here. */
	entry_time = now = ktime_get();
retry:
	expires_next.tv64 = KTIME_MAX;

	raw_spin_lock(&cpu_base->lock);
	/*
	 * We set expires_next to KTIME_MAX here with cpu_base->lock
	 * held to prevent that a timer is enqueued in our queue via
	 * the migration code. This does not affect enqueueing of
	 * timers which run their callback and need to be requeued on
	 * this CPU.
	 */
	cpu_base->expires_next.tv64 = KTIME_MAX;

	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		struct hrtimer_clock_base *base;
		struct timerqueue_node *node;
		ktime_t basenow;

		if (!(cpu_base->active_bases & (1 << i)))
			continue;

		base = cpu_base->clock_base + i;
		/* "now" expressed on this base's clock */
		basenow = ktime_add(now, base->offset);

		/* Take the earliest queued node of this base, if any. */
		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;

			/* Recover the hrtimer that embeds this queue node. */
			timer = container_of(node, struct hrtimer, node);

			/*
			 * The immediate goal for using the softexpires is
			 * minimizing wakeups, not running timers at the
			 * earliest interrupt after their soft expiration.
			 * This allows us to avoid using a Priority Search
			 * Tree, which can answer a stabbing query for
			 * overlapping intervals and instead use the simple
			 * BST we already have.
			 * We don't add extra wakeups by delaying timers that
			 * are right-of a not yet expired timer, because that
			 * timer will have to trigger a wakeup anyway.
			 */

			/*
			 * The earliest timer has not reached its soft expiry:
			 * nothing else on this base is due. Fold its expiry
			 * into expires_next and go on to the next clock base.
			 */
			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
				ktime_t expires;

				expires = ktime_sub(hrtimer_get_expires(timer),
						    base->offset);
				if (expires.tv64 < expires_next.tv64)
					expires_next = expires;
				break;
			}

			/* Expired: run it (per the article, this also dequeues the timer). */
			__run_hrtimer(timer, &basenow);
		}
	}

	/*
	 * Store the new expiry value so the migration code can verify
	 * against it.
	 */
	cpu_base->expires_next = expires_next;
	raw_spin_unlock(&cpu_base->lock);

	/*
	 * Reprogramming necessary ?
	 * Programming fails when expires_next already lies in the past,
	 * e.g. because the callbacks took too long.
	 */
	if (expires_next.tv64 == KTIME_MAX ||
	    !tick_program_event(expires_next, 0)) {
		cpu_base->hang_detected = 0;
		return;
	}

	/*
	 * The next timer was already expired due to:
	 * - tracing
	 * - long lasting callbacks
	 * - being scheduled away when running in a VM
	 *
	 * We need to prevent that we loop forever in the hrtimer
	 * interrupt routine. We give it 3 attempts to avoid
	 * overreacting on some spurious event.
	 */
	now = ktime_get();
	cpu_base->nr_retries++;
	if (++retries < 3)
		goto retry;
	/*
	 * Give the system a chance to do something else than looping
	 * here. We stored the entry time, so we know exactly how long
	 * we spent here. We schedule the next event this amount of
	 * time away.
	 */
	cpu_base->nr_hangs++;
	cpu_base->hang_detected = 1;
	delta = ktime_sub(now, entry_time);
	if (delta.tv64 > cpu_base->max_hang_time.tv64)
		cpu_base->max_hang_time = delta;
	/*
	 * Limit it to a sensible value as we enforce a longer
	 * delay: the next event is scheduled "delta" from now, but
	 * never more than 100ms away.
	 */
	if (delta.tv64 > 100 * NSEC_PER_MSEC)
		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
	else
		expires_next = ktime_add(now, delta);
	/* NOTE(review): second argument 1 presumably forces programming - confirm. */
	tick_program_event(expires_next, 1);
	printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
		    ktime_to_ns(delta));
}

秒客网

linux内核定时器

1. timer wheel定时器

2. hrtimer定时器

相关文章