This article uses tcp_poll as an example to analyze the select source code. The call chain is:
select ---> sys_select ---> do_select ---> sock_poll ---> tcp_poll
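Before diving into the kernel side, here is a minimal user-space sketch of the call we are tracing (sockfd is a hypothetical, already-connected TCP socket; error handling trimmed):

#include <sys/select.h>
#include <sys/time.h>

/* Wait up to 5 seconds for sockfd to become readable.
 * This select() call is what enters sys_select below. */
int wait_readable(int sockfd)
{
    fd_set rfds;
    struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

    FD_ZERO(&rfds);
    FD_SET(sockfd, &rfds);

    /* Returns >0 if ready, 0 on timeout, -1 on error/signal. */
    return select(sockfd + 1, &rfds, NULL, NULL, &tv);
}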
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
    fd_set_bits fds;
    char *bits;
    long timeout;
    int ret, size, max_fdset;

    timeout = MAX_SCHEDULE_TIMEOUT;
    if (tvp) {
        time_t sec, usec;

        if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
            || (ret = __get_user(sec, &tvp->tv_sec))
            || (ret = __get_user(usec, &tvp->tv_usec)))
            goto out_nofds;

        ret = -EINVAL;
        if (sec < 0 || usec < 0)
            goto out_nofds;

        if ((unsigned long) sec < MAX_SELECT_SECONDS) {
            /* Convert the timeval into jiffies */
            timeout = ROUND_UP(usec, 1000000/HZ);
            timeout += sec * (unsigned long) HZ;
        }
    }

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fdset can increase, so grab it once to avoid race */
    max_fdset = current->files->max_fdset;
    if (n > max_fdset)
        n = max_fdset;

    ret = -ENOMEM;
    size = FDS_BYTES(n);
    bits = select_bits_alloc(size);
    if (!bits)
        goto out_nofds;
    fds.in      = (unsigned long *)  bits;
    fds.out     = (unsigned long *) (bits +   size);
    fds.ex      = (unsigned long *) (bits + 2*size);
    fds.res_in  = (unsigned long *) (bits + 3*size);
    fds.res_out = (unsigned long *) (bits + 4*size);
    fds.res_ex  = (unsigned long *) (bits + 5*size);

    /* Copy the read/write/exception bitmaps of every watched fd
       from user space into kernel space */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    /* The core routine */
    ret = do_select(n, &fds, &timeout);

    if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
        time_t sec = 0, usec = 0;
        if (timeout) {
            sec = timeout / HZ;
            usec = timeout % HZ;
            usec *= (1000000/HZ);
        }
        put_user(sec, &tvp->tv_sec);
        put_user(usec, &tvp->tv_usec);
    }

    if (ret < 0)
        goto out;
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    select_bits_free(bits, size);
out_nofds:
    return ret;
}
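For reference, the six bitmap pointers set up above come from fd_set_bits (include/linux/poll.h in this kernel era); all six views share the single bits allocation:

typedef struct {
    unsigned long *in, *out, *ex;          /* what the caller is watching */
    unsigned long *res_in, *res_out, *res_ex; /* what do_select reports back */
} fd_set_bits;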
int do_select(int n, fd_set_bits *fds, long *timeout)
{
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i;
    long __timeout = *timeout;

    spin_lock(&current->files->file_lock);
    retval = max_select_fd(n, fds);
    spin_unlock(&current->files->file_lock);

    if (retval < 0)
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;
    if (!__timeout)
        wait = NULL;
    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        /* Mark the current process as interruptible sleep; it has
           not actually been scheduled out yet */
        set_current_state(TASK_INTERRUPTIBLE);

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            struct file_operations *f_op = NULL;
            struct file *file = NULL;

            /* Skip whole words in which no fd is watched; time is
               wasted scanning bits nobody cares about */
            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            /* Walk every watched bit in this word */
            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                file = fget(i);
                if (file) {
                    f_op = file->f_op;
                    mask = DEFAULT_POLLMASK;
                    if (f_op && f_op->poll)
                        /* Call the poll function: it hooks the current process
                           onto the wait queue and registers the wakeup callback
                           (invoked by the driver when data arrives), then reports
                           whether this fd is readable, writable or in an
                           exceptional state. (For sockets, f_op->poll is
                           sock_poll, installed via socket_file_ops.) */
                        mask = (*f_op->poll)(file, retval ? NULL : wait);
                    fput(file);
                    /* Readable */
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                    }
                    /* Writable */
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                    }
                    /* Exceptional condition */
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                    }
                }
                /* Reschedule if necessary */
                cond_resched();
            }
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
        }
        /* After a full scan, check retval: if any fd turned out to be
           readable/writable/exceptional, retval is non-zero and we
           leave the loop */
        wait = NULL;
        if (retval || !__timeout || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }
        /* Nothing of interest was found. If the caller asked to wait
           (possibly forever), schedule_timeout() schedules the current
           process out. When a driver receives data it invokes the wakeup
           callback registered through poll, the process resumes here, and
           the loop re-checks all watched bits. */
        __timeout = schedule_timeout(__timeout);
    }
    __set_current_state(TASK_RUNNING);

    poll_freewait(&table);

    /*
     * Update the caller's timeout.
     */
    *timeout = __timeout;
    return retval;
}
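The POLLIN_SET/POLLOUT_SET/POLLEX_SET masks used above are defined in fs/select.c. Note that POLLERR and POLLHUP count as "readable", which is why select reports a closed connection as ready for reading:

#define POLLIN_SET  (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET  (POLLPRI)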
/* No kernel lock held - perfect */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
    struct socket *sock;

    /*
     * We can't return errors to poll, so it's either yes or no.
     */
    sock = SOCKET_I(file->f_dentry->d_inode);
    /* In our example this dispatches to tcp_poll */
    return sock->ops->poll(file, sock, wait);
}
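The reason f_op->poll lands in sock_poll at all is that socket files are created with socket_file_ops (net/socket.c); abridged here to the fields relevant to this discussion:

static struct file_operations socket_file_ops = {
    .owner  = THIS_MODULE,
    .llseek = no_llseek,
    .poll   = sock_poll,   /* <-- what do_select ends up calling */
    /* ... read/write/ioctl/mmap/release etc. ... */
};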
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask;
    struct sock *sk = sock->sk;
    struct tcp_sock *tp = tcp_sk(sk);

    /* Add the current process to the socket's wait queue,
       together with its wakeup callback */
    poll_wait(file, sk->sk_sleep, wait);
    if (sk->sk_state == TCP_LISTEN)
        return tcp_listen_poll(sk, wait);

    mask = 0;
    if (sk->sk_err)
        mask = POLLERR;

    if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
        mask |= POLLHUP;
    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLIN | POLLRDNORM;

    /* Connected? */
    if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        /* Potential race condition. If read of tp below will
         * escape above sk->sk_state, we can be illegally awaken
         * in SYN_* states. */
        if ((tp->rcv_nxt != tp->copied_seq) &&
            (tp->urg_seq != tp->copied_seq ||
             tp->rcv_nxt != tp->copied_seq + 1 ||
             sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
            mask |= POLLIN | POLLRDNORM;

        if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
            if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
                mask |= POLLOUT | POLLWRNORM;
            } else { /* send SIGIO later */
                set_bit(SOCK_ASYNC_NOSPACE,
                        &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                /* Race breaker. If space is freed after
                 * wspace test but before the flags are set,
                 * IO signal will be lost.
                 */
                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                    mask |= POLLOUT | POLLWRNORM;
            }
        }

        if (tp->urg_data & TCP_URG_VALID)
            mask |= POLLPRI;
    }
    return mask;
}
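poll_wait itself is just a dispatcher (include/linux/poll.h in this kernel era): when do_select passed a non-NULL table it forwards to the qproc callback that poll_initwait registered, i.e. __pollwait below; with a NULL table it does nothing, which is how the second and later passes avoid re-queueing the process:

static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    if (p && wait_address)
        p->qproc(filp, wait_address, p);  /* qproc == __pollwait for select/poll */
}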
/* This is where the waiting is actually set up; called once per watched fd */
void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
{
    struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
    struct poll_table_page *table = p->table;

    if (!table || POLL_TABLE_FULL(table)) {
        struct poll_table_page *new_table;

        new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!new_table) {
            p->error = -ENOMEM;
            __set_current_state(TASK_RUNNING);
            return;
        }
        new_table->entry = new_table->entries;
        new_table->next = table;
        p->table = new_table;
        table = new_table;
    }

    /* Add a new entry */
    {
        struct poll_table_entry *entry = table->entry;
        table->entry = entry + 1;
        get_file(filp);
        entry->filp = filp;
        entry->wait_address = wait_address;
        /* Put the current process on the wait queue; the entry
           carries the wakeup function */
        init_waitqueue_entry(&entry->wait, current);
        add_wait_queue(wait_address, &entry->wait);
    }
}
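For completeness, the teardown side: when do_select finishes, poll_freewait walks the same table pages, unhooks every entry from its wait queue and drops the file references taken by get_file (lightly abridged from the same kernel era):

void poll_freewait(struct poll_wqueues *pwq)
{
    struct poll_table_page *p = pwq->table;
    while (p) {
        struct poll_table_entry *entry;
        struct poll_table_page *old;

        entry = p->entry;
        do {
            entry--;
            /* Undo __pollwait: leave the wait queue, release the file */
            remove_wait_queue(entry->wait_address, &entry->wait);
            fput(entry->filp);
        } while (entry > p->entries);
        old = p;
        p = p->next;
        free_page((unsigned long) old);
    }
}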
/**
 * Non-exclusive waiters are woken through default_wake_function,
 * which is a thin wrapper around try_to_wake_up.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
    task_t *p = curr->task;
    return try_to_wake_up(p, mode, sync);
}
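This is precisely the function __pollwait installed: init_waitqueue_entry (include/linux/wait.h) fills the wait-queue entry with the current task and default_wake_function as its wakeup callback:

static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
    q->flags = 0;
    q->task = p;
    q->func = default_wake_function;  /* called when the queue is woken */
}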
/**
 * Wakes up a sleeping or stopped process by setting its state to
 * TASK_RUNNING and inserting it into a CPU's runqueue.
 * p     - descriptor of the process to wake
 * state - mask of process states that may be woken
 * sync  - flag that forbids the woken process from preempting the
 *         process currently running on the local CPU
 */
static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
    int cpu, this_cpu, success = 0;
    unsigned long flags;
    long old_state;
    runqueue_t *rq;
#ifdef CONFIG_SMP
    unsigned long load, this_load;
    struct sched_domain *sd;
    int new_cpu;
#endif

    /**
     * task_rq_lock disables interrupts and takes the lock of the runqueue
     * of the CPU the task belongs to (which may differ from the current
     * CPU's runqueue; the task being woken may not be on any runqueue).
     */
    rq = task_rq_lock(p, &flags);
    schedstat_inc(rq, ttwu_cnt);
    old_state = p->state;
    /**
     * Only wake processes whose state is included in "state". If the
     * target's state is not in the mask, bail out: this wakeup is a no-op.
     * For example, a signal does not wake a TASK_UNINTERRUPTIBLE process.
     */
    if (!(old_state & state))
        goto out;

    /**
     * If the process is already on a runqueue, jump to out_running,
     * set its state to TASK_RUNNING and return.
     */
    if (p->array)
        goto out_running;

    cpu = task_cpu(p);
    this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
    /**
     * On SMP, check whether the woken process should be migrated from the
     * runqueue of the CPU it last ran on to another CPU's runqueue.
     */

    /**
     * The task is currently running on a CPU; no migration to consider.
     */
    if (unlikely(task_running(rq, p)))
        goto out_activate;

    /**
     * Prefer to run the process on the CPU it was on.
     */
    new_cpu = cpu;

    /**
     * If that CPU is the current CPU, or the woken process is not allowed
     * to run on the current CPU, jump to out_set_cpu.
     */
    if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
        goto out_set_cpu;

    load = source_load(cpu);
    this_load = target_load(this_cpu);

    /*
     * If sync wakeup then subtract the (maximum possible) effect of
     * the currently running task from the load of the current CPU:
     */
    if (sync)
        this_load -= SCHED_LOAD_SCALE;

    /* Don't pull the task off an idle CPU to a busy one */
    /**
     * If the woken task's CPU carries less load than the current CPU,
     * also jump to out_set_cpu.
     */
    if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
        goto out_set_cpu;

    /**
     * Try to migrate the process to the local CPU.
     */
    new_cpu = this_cpu; /* Wake to this CPU if we can */

    /*
     * Scan domains for affine wakeup and passive balancing
     * possibilities.
     */
    for_each_domain(this_cpu, sd) {
        unsigned int imbalance;
        /*
         * Start passive balancing when half the imbalance_pct
         * limit is reached.
         */
        imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

        if ((sd->flags & SD_WAKE_AFFINE) &&
            !task_hot(p, rq->timestamp_last_tick, sd)) {
            /*
             * This domain has SD_WAKE_AFFINE and p is cache cold
             * in this domain.
             */
            if (cpu_isset(cpu, sd->span)) {
                schedstat_inc(sd, ttwu_wake_affine);
                goto out_set_cpu;
            }
        } else if ((sd->flags & SD_WAKE_BALANCE) &&
                   imbalance*this_load <= 100*load) {
            /*
             * This domain has SD_WAKE_BALANCE and there is
             * an imbalance.
             */
            if (cpu_isset(cpu, sd->span)) {
                schedstat_inc(sd, ttwu_wake_balance);
                goto out_set_cpu;
            }
        }
    }

    new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
    schedstat_inc(rq, ttwu_attempts);
    new_cpu = wake_idle(new_cpu, p);
    if (new_cpu != cpu) {
        schedstat_inc(rq, ttwu_moved);
        set_task_cpu(p, new_cpu);
        task_rq_unlock(rq, &flags);
        /* might preempt at this point */
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
        if (!(old_state & state))
            goto out;
        if (p->array)
            goto out_running;

        this_cpu = smp_processor_id();
        cpu = task_cpu(p);
    }

out_activate:
#endif /* CONFIG_SMP */
    /**
     * If the old state was TASK_UNINTERRUPTIBLE, decrement
     * nr_uninterruptible and set activated to -1 to record the fact
     * that the process was woken from TASK_UNINTERRUPTIBLE.
     */
    if (old_state == TASK_UNINTERRUPTIBLE) {
        rq->nr_uninterruptible--;
        /*
         * Tasks on involuntary sleep don't earn
         * sleep_avg beyond just interactive state.
         */
        p->activated = -1;
    }

    /*
     * Sync wakeups (i.e. those types of wakeups where the waker
     * has indicated that it will leave the CPU in short order)
     * don't trigger a preemption, if the woken up task will run on
     * this cpu. (in this case the 'I will reschedule' promise of
     * the waker guarantees that the freshly woken up task is going
     * to be considered on this CPU.)
     */
    /**
     * activate_task performs the following steps:
     * 1: call sched_clock to get the current timestamp; if the target CPU
     *    is not the local CPU, compensate for clock-interrupt skew.
     * 2: call recalc_task_prio to compute the process's dynamic priority.
     * 3: set activated as appropriate.
     * 4: set the process's timestamp.
     * 5: insert the process into the runqueue's priority array.
     */
    activate_task(p, rq, cpu == this_cpu);
    /**
     * If the target CPU is not the local CPU, or the SYNC flag is not set,
     * check whether the new task's dynamic priority is higher than that of
     * the task currently running on the runqueue.
     */
    if (!sync || cpu != this_cpu) {
        if (TASK_PREEMPTS_CURR(p, rq)) /* the woken task beats the queue's current task: preempt */
            /**
             * resched_task triggers preemption.
             * On a uniprocessor it merely sets TIF_NEED_RESCHED.
             * On SMP it may send an IPI to force the CPU to reschedule.
             */
            resched_task(rq->curr);
    }

    success = 1;

out_running:
    /**
     * Set the state to TASK_RUNNING; note that two paths reach this label.
     */
    p->state = TASK_RUNNING;
out:
    /**
     * Re-enable interrupts and release the runqueue lock.
     */
    task_rq_unlock(rq, &flags);

    /**
     * Returns 0 if the process was not woken, 1 if it was.
     */
    return success;
}
When the underlying driver receives data, it raises an interrupt; the handler ultimately calls default_wake_function to wake the corresponding process, which then resumes inside do_select and re-checks the bits it cares about. Exactly how a driver notifies the upper layers deserves further study and analysis.
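As a pointer for that further analysis, the socket half of the notification path is visible in net/core/sock.c of the same kernel era. When the TCP receive path queues data it calls sk->sk_data_ready, which defaults to sock_def_readable; that does wake_up_interruptible on sk->sk_sleep, the very wait queue tcp_poll handed to poll_wait, and the wakeup walks the queue invoking each entry's func, i.e. default_wake_function (sketch, lightly annotated):

static void sock_def_readable(struct sock *sk, int len)
{
    read_lock(&sk->sk_callback_lock);
    if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
        wake_up_interruptible(sk->sk_sleep);  /* wakes the select()ing task */
    sk_wake_async(sk, 1, POLL_IN);            /* SIGIO for async users */
    read_unlock(&sk->sk_callback_lock);
}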