poll 从应用层到内核实现解析

　　poll函数的原型如下所示：

　　int poll(struct pollfd *fds, nfds_t nfds, int timeout);

　　poll可以监视多个描述符的属性变化，其参数的意义如下：

　　参数fds:

　　指向一个结构体数组的第0个元素的指针，每个数组元素都是一个struct pollfd结构，具体如下：

 /*
  * One element of the array handed to poll(): names a descriptor, the
  * events the caller wants to wait for, and the events reported back.
  */
 struct pollfd {
     int   fd;       /* file descriptor to watch */
     short events;   /* events the caller is waiting for */
     short revents;  /* events that actually occurred */
 };

　　pollfd结构中的fd为文件描述符，events为待监视的事件，由用户进行设置，可选的监视事件和返回事件如下所示：

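　　POLLIN：有数据可读。

　　POLLPRI：有紧急（高优先级）数据可读。

　　POLLOUT：现在可以写数据而不会阻塞。

　　POLLERR：发生错误（只由revents返回，不需要在events中设置）。

　　POLLHUP：发生挂起（只由revents返回，不需要在events中设置）。

　　POLLNVAL：fd不是一个已打开的文件描述符（只由revents返回）。

　　revents为具体产生的事件，由内核进行设置，也就是当某一个设备文件描述符产生了具体事件时，内核会设置revents，并最终返回给用户空间。由上面的列表可以看到，events中设置的值，都可能由revents返回。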

　　参数nfds：用来指定第一个参数数组中元素的个数

　　参数timeout：超时值，若为-1，则poll永远等待，若为0，则立即返回，若大于0，则为具体的超时时间，单位是毫秒。

　　poll函数执行成功时，返回结构体中 revents 域不为 0 的文件描述符个数；如果在超时前没有任何事件发生，poll()返回 0，如果poll执行失败，返回-1，并设置errno的值，错误值具体如下：

　　EBADF：一个或多个结构体中指定的文件描述符无效。

　　EFAULT：fds 指针指向的地址超出进程的地址空间。

　　EINTR：请求的事件之前产生一个信号，调用可以重新发起。

　　EINVAL：nfds 参数超出 RLIMIT_NOFILE 值。

　　ENOMEM：可用内存不足，无法完成请求。

　　poll的调用路径为sys_poll->do_sys_poll->do_poll->do_pollfd

　　do_sys_poll将用户空间的pollfd拷贝到内核空间，初始化poll_wqueues table对象。代码如下：

 /*
  * do_sys_poll - copy the caller's pollfd array into the kernel, poll it
  * through do_poll(), then write each entry's revents back to user space.
  *
  * @ufds:     user-space array of struct pollfd
  * @nfds:     number of entries in @ufds
  * @end_time: handed unchanged to do_poll()
  *
  * Returns the value produced by do_poll() on success, -EINVAL when nfds
  * exceeds rlimit(RLIMIT_NOFILE), -ENOMEM when a list chunk cannot be
  * allocated, and -EFAULT when copying from or to user space fails.
  */
 int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
         struct timespec *end_time)
 {
     struct poll_wqueues table;
     int err = -EFAULT, fdcount, len, size;
     /* Allocate small arguments on the stack to save memory and be
        faster - use long to make sure the buffer is aligned properly
        on 64 bit archs to avoid unaligned access */
     long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
     struct poll_list *const head = (struct poll_list *)stack_pps;
     struct poll_list *walk = head;
     unsigned long todo = nfds;

     if (nfds > rlimit(RLIMIT_NOFILE))
         return -EINVAL;

     len = min_t(unsigned int, nfds, N_STACK_PPS);

     /*
      * Build a chain of poll_list chunks holding the caller's entries.
      * The first chunk lives in the on-stack buffer; every further chunk
      * is kmalloc'ed. The loop ends once all nfds entries are copied
      * (todo reaches 0) or immediately when there is nothing to copy.
      */
     for (;;) {
         walk->next = NULL;
         walk->len = len;
         if (!len)
             break;

         /*
          * Copy the next walk->len entries from user space into this
          * chunk; ufds + nfds - todo is the first entry not yet copied.
          * err is still -EFAULT here, so a failed copy returns -EFAULT.
          */
         if (copy_from_user(walk->entries, ufds + nfds-todo,
                     sizeof(struct pollfd) * walk->len))
             goto out_fds;

         todo -= walk->len;
         if (!todo)
             break;

         len = min(todo, POLLFD_PER_PAGE);
         size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
         walk = walk->next = kmalloc(size, GFP_KERNEL);
         if (!walk) {
             err = -ENOMEM;
             goto out_fds;
         }
     }

     /* Initialise the wait-queue bookkeeping used while polling. */
     poll_initwait(&table);

     /*
      * do_poll() does the real work and its return value becomes the
      * syscall result. head is the first chunk of the list built above;
      * table is the struct poll_wqueues whose embedded poll_table (pt) is
      * what do_poll() passes on to the drivers' poll methods.
      * NOTE(review): per the original author's note, the wait-queue
      * entries themselves are kept in
      * poll_table_page->poll_table_entry and are reached by poll_wait()
      * through the poll_table pointer - not visible in this excerpt.
      */
     fdcount = do_poll(nfds, head, &table, end_time);
     poll_freewait(&table);

     /* Write every entry's revents back to the caller's array. */
     for (walk = head; walk; walk = walk->next) {
         struct pollfd *fds = walk->entries;
         int j;

         for (j = 0; j < walk->len; j++, ufds++)
             if (__put_user(fds[j].revents, &ufds->revents))
                 goto out_fds;
     }

     err = fdcount;
 out_fds:
     /* Free only the kmalloc'ed chunks; head itself is on the stack. */
     walk = head->next;
     while (walk) {
         struct poll_list *pos = walk;
         walk = walk->next;
         kfree(pos);
     }

     return err;
 }

　　在do_sys_poll调用do_poll，具体代码如下：

 /*
  * do_poll - repeatedly scan every pollfd in @list until at least one
  * reports an event, an error or signal is pending, or the timeout hits.
  *
  * @nfds:     number of descriptors (not used inside this function)
  * @list:     chain of poll_list chunks holding the pollfd entries
  * @wait:     wait-queue state; its pt member is passed to do_pollfd()
  * @end_time: absolute timeout, or NULL for none; an all-zero value
  *            means "do not wait at all"
  *
  * Returns the number of entries for which do_pollfd() returned a
  * non-zero mask; otherwise wait->error, or -EINTR if a signal is
  * pending; or 0 when the wait timed out with nothing to report.
  */
 static int do_poll(unsigned int nfds,  struct poll_list *list,
            struct poll_wqueues *wait, struct timespec *end_time)
 {
     poll_table *pt = &wait->pt;
     ktime_t expire, *to = NULL;
     int timed_out = 0, count = 0;
     unsigned long slack = 0;

     /* Optimise the no-wait case */
     if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
         pt = NULL;
         timed_out = 1;
     }

     if (end_time && !timed_out)
         slack = select_estimate_accuracy(end_time);

     for (;;) {
         struct poll_list *walk;

         /*
          * Walk the chunk list starting at list. Each chunk carries
          * walk->len pollfd entries in walk->entries, and every one of
          * them is checked below.
          */
         for (walk = list; walk != NULL; walk = walk->next) {
             struct pollfd *pfd, *pfd_end;

             pfd = walk->entries;
             pfd_end = pfd + walk->len;
             for (; pfd != pfd_end; pfd++) {
                 /*
                  * Fish for events. If we found one, record it
                  * and kill the poll_table, so we don't
                  * needlessly register any other waiters after
                  * this. They'll get immediately deregistered
                  * when we break out and return.
                  */
                 /*
                  * do_pollfd() ends up calling the poll method from
                  * the file's file_operations, i.e. the driver's
                  * own poll implementation.
                  */
                 if (do_pollfd(pfd, pt)) {
                     count++;
                     pt = NULL;
                 }
             }
         }
         /*
          * All waiters have already been registered, so don't provide
          * a poll_table to them on the next loop iteration.
          */
         pt = NULL;
         if (!count) {
             count = wait->error;
             if (signal_pending(current))
                 count = -EINTR;
         }
         if (count || timed_out)
             break;

         /*
          * If this is the first loop and we have a timeout
          * given, then we convert to ktime_t and set the to
          * pointer to the expiry value.
          */
         if (end_time && !to) {
             expire = timespec_to_ktime(*end_time);
             to = &expire;
         }

         /*
          * Put the current task to sleep; it has already been queued
          * on the wait queues by the scan above. A zero return is
          * treated as the timeout having expired, so the next scan is
          * the last one.
          */
         if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
             timed_out = 1;
     }
     return count;
 }

　　list中包含了待监视的fd及其相关信息，对list链表中的每一项都执行了do_pollfd(pfd, pt)，并最终调用到驱动程序的poll函数，进一步调用到poll_wait，最终调用到__pollwait。大概完成的工作是为每一个fd分配poll_table_entry并初始化，然后将当前进程封装成一个等待队列项，并将这个等待队列项加入到fd设备的等待队列中。do_pollfd的代码如下：

 /*
  * do_pollfd - query one pollfd entry and store the outcome in its
  * revents field.
  *
  * @pollfd: entry to query; revents is overwritten
  * @pwait:  poll_table passed to the file's poll method, or NULL
  *
  * Returns the mask stored in revents:
  *   - 0 when pollfd->fd is negative (the entry is ignored);
  *   - POLLNVAL when fget_light() finds no file for the descriptor;
  *   - otherwise the file's poll result (DEFAULT_POLLMASK when the file
  *     has no poll method), limited to the requested events plus
  *     POLLERR and POLLHUP.
  */
 static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
     unsigned int mask;
     int fd;

     mask = 0;
     fd = pollfd->fd;
     if (fd >= 0) {
         int fput_needed;
         struct file *file;

         file = fget_light(fd, &fput_needed);
         mask = POLLNVAL;
         if (file != NULL) {
             mask = DEFAULT_POLLMASK;
             if (file->f_op && file->f_op->poll) {
                 /* Tell the wait-queue code which events matter. */
                 if (pwait)
                     pwait->key = pollfd->events |
                             POLLERR | POLLHUP;
                 /* This is where the driver's own poll method runs. */
                 mask = file->f_op->poll(file, pwait);
             }
             /* Mask out unneeded events. */
             mask &= pollfd->events | POLLERR | POLLHUP;
             fput_light(file, fput_needed);
         }
     }
     pollfd->revents = mask;

     return mask;
 }

　　调用到__pollwait之后差不多就和select函数的调用殊途同归了，将等待队列项加入到设备等待队列的时候同时查看一下设备的状态，如果就绪了，就将状态写回revents中，这样查询完所有的fd之后就可以返回了。如果查询完所有的fd之后没有设备就绪，那就根据timeout的值判断一下，我们假设timeout的值是大于0的，因为没有设备准备就绪，所以当前进程进入睡眠。等到超时时间到或者被设备就绪信号唤醒时，会再次调用每个fd对应的poll函数，对它们的状态再进行一次查询，查询完所有的设备后，revents中也写好了相应的事件，下一步就返回到用户空间中了。

　　poll函数相比于select函数，它没有select那样固定的描述符数量上限（FD_SETSIZE），可以监视的描述符个数只受RLIMIT_NOFILE的限制（如上面do_sys_poll中所示，nfds超过该值时返回-EINVAL）。每次返回后events不会被破坏，下一次调用poll可以继续使用。

秒客网

poll 从应用层到内核实现解析

相关文章