Linux Netlink通信机制详解(下)

时间:2022-09-16 16:14:48
  这里我以路由中的netlink为例,看一下内核中的处理流程是怎么样的!在/kernel/net/core/rtnetlink.c文件中,有一个接收从用户空间过来的Netlink消息的函数。
  /*
   * Receive callback for the rtnetlink socket.
   *
   * Repeatedly hands the socket's queued messages to netlink_run_queue(),
   * with rtnetlink_rcv_msg as the per-message callback, until qlen comes
   * back as 0. The len parameter is not used in this body.
   */
  1. static void rtnetlink_rcv(struct sock*sk, int len)
  2. {
      /* Must start at 0: netlink_run_queue() fills in the queue length on first entry. */
  3.     unsigned int qlen = 0;

  4.     do {
  5.         rtnl_lock();
  6.         netlink_run_queue(sk,&qlen, &rtnetlink_rcv_msg);
          /* NOTE(review): presumably releases what rtnl_lock() acquired — confirm. */
  7.         up(&rtnl_sem);

  8.         netdev_run_todo();
      /* A non-zero qlen means messages are still pending, so run another pass. */
  9.     } while(qlen);
  10. }

上面的内核函数就是用来接收用户空间发来的路由相关Netlink消息的:当我们使用route命令添加一条路由时,内核就会调用该函数进行接收。该函数是在netlink初始化时注册的,注册代码同样位于rtnetlink.c文件中。

  /*
   * Initialise rtnetlink.
   *
   * Sizes and allocates the shared attribute-pointer buffer rta_buf, creates
   * the NETLINK_ROUTE kernel socket with rtnetlink_rcv as its input callback,
   * and installs link_rtnetlink_table for PF_UNSPEC and PF_PACKET.
   * Panics if the buffer or the socket cannot be created.
   */
  1. void __init rtnetlink_init(void)
  2. {
  3.     int i;

      /* rtattr_max becomes the largest value found in rta_max[]. */
  4.     rtattr_max = 0;
  5.     for (i= 0; i < ARRAY_SIZE(rta_max); i++)
  6.         if (rta_max[i]> rtattr_max)
  7.             rtattr_max = rta_max[i];
      /* One struct rtattr pointer slot per possible attribute index. */
  8.     rta_buf = kmalloc(rtattr_max* sizeof(struct rtattr*), GFP_KERNEL);
  9.     if (!rta_buf)
  10.         panic("rtnetlink_init: cannot allocate rta_buf\n");

  11.     rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
  12.      THIS_MODULE);// Creating the kernel netlink socket registers rtnetlink_rcv as the receive function for route netlink.
  13.     if (rtnl== NULL)
  14.         panic("rtnetlink_init: cannot initialize rtnetlink\n");
  15.     netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
  16.     register_netdevice_notifier(&rtnetlink_dev_notifier);
      /* Both families share the same handler table. */
  17.     rtnetlink_links[PF_UNSPEC]= link_rtnetlink_table;
  18.     rtnetlink_links[PF_PACKET]= link_rtnetlink_table;
  19. }

在netlink_kernel_create函数中,可以看到内核是如何设置用于接收用户空间消息的接收函数的:

  /*
   * Create the kernel-side netlink socket for protocol number `unit`.
   *
   * @unit:   netlink protocol number; must satisfy 0 <= unit < MAX_LINKS
   * @groups: number of multicast groups; values below 32 are raised to 32
   * @input:  callback stored as the socket's netlink data_ready handler
   *          (may be NULL, in which case none is installed)
   * @module: owning module recorded in nl_table[unit]
   *
   * Returns the new struct sock, or NULL if nl_table is not set up, unit is
   * out of range, or any creation/insert step fails (the socket is released
   * on the failure paths that occur after it was created).
   */
  1. struct sock *
  2. netlink_kernel_create(int unit, unsigned int groups,
  3.                       void (*input)(struct sock*sk, int len),
  4.                       struct module *module)
  5. {
  6.     struct socket *sock;
  7.     struct sock *sk;
  8.     struct netlink_sock *nlk;

  9.     if (!nl_table)
  10.         return NULL;

  11.     if (unit<0|| unit>=MAX_LINKS)
  12.         return NULL;

  13.     if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit,&sock))
  14.         return NULL;

  15.     if (__netlink_create(sock, unit)< 0)
  16.         goto out_sock_release;

  17.     sk = sock->sk;
  18.     sk->sk_data_ready= netlink_data_ready;
  19.     if (input)
  20.         nlk_sk(sk)->data_ready= input;// Install the kernel's netlink receive function; for rtnetlink this is rtnetlink_rcv.

  21.     if (netlink_insert(sk, 0))
  22.         goto out_sock_release;

  23.     nlk = nlk_sk(sk);// Get the netlink_sock that embeds this sock.
  24.     nlk->flags|= NETLINK_KERNEL_SOCKET;

  25.     netlink_table_grab();
  26.     nl_table[unit].groups= groups < 32? 32 : groups;
  27.     nl_table[unit].module= module;
  28.     nl_table[unit].registered= 1;// Update the netlink_table entry;
  29.     // each protocol has its own netlink_table structure.
  30.     netlink_table_ungrab();

  31.     return sk;

  32. out_sock_release:
  33.     sock_release(sock);
  34.     return NULL;
  35. }

   到此,内核创建netlink到接收用户空间发送过来消息整个流程就清晰了。那当我们添加一条新路由时,在接收函数rtnetlink_rcv中的循环中,会从一个队列中调用实际的接收处理函数,这里为rtnetlink_rcv_msg函数。

  1. /**
  2.  * netlink_run_queue - Process netlink receive queue.
  3.  * @sk: Netlink socket containing the queue
  4.  * @qlen: Place to store queue length upon entry
  5.  * @cb: Callback function invoked for each netlink message found
  6.  *
  7.  * Processes as much as there was in the queue upon entry and invokes
  8.  * a callback function for each netlink message found. The callback
  9.  * function may refuse a message by returning a negative error code
  10.  * but setting the error pointer to 0 in which case this function
  11.  * returns with a qlen != 0.
  12.  *
  13.  * qlen must be initialized to 0 before the initial entry, afterwards
  14.  * the function may be called repeatedly until qlen reaches 0.
  15.  */
  16. void netlink_run_queue(struct sock *sk, unsigned int *qlen,
  17.          int (*cb)(struct sk_buff*, struct nlmsghdr*, int *))
  18. {
  19.     struct sk_buff *skb;

      /* First entry (qlen == 0) or stale count: take the current queue length. */
  20.     if (!*qlen|| *qlen > skb_queue_len(&sk->sk_receive_queue))
  21.         *qlen = skb_queue_len(&sk->sk_receive_queue);

  22.     for (;*qlen; (*qlen)--){
  23.         skb = skb_dequeue(&sk->sk_receive_queue);
  24.         if (netlink_rcv_skb(skb, cb)){
              /* Non-zero result: stop this pass. An skb that still has data
               * goes back to the head of the queue; an empty one is freed
               * and counted as consumed. */
  25.             if (skb->len)
  26.                 skb_queue_head(&sk->sk_receive_queue, skb);
  27.             else {
  28.                 kfree_skb(skb);
  29.                 (*qlen)--;
  30.             }
  31.             break;
  32.         }

  33.         kfree_skb(skb);
  34.     }
  35. }

  下面是rtnetlink_rcv_msg()函数的实现,对netlink消息进行相应的处理。其中有一个数据结构

  struct rtnetlink_link *link; 其定义如下,其中包含两个不同的处理函数指针(doit和dumpit):

  /*
   * Pair of handlers for one rtnetlink message type.
   * rtnetlink_rcv_msg() calls doit with (skb, nlh, rta_buf) and passes
   * dumpit to netlink_dump_start(); either pointer may be NULL.
   */
  1. struct rtnetlink_link
  2. {
  3.     int (*doit)(struct sk_buff*, struct nlmsghdr*, void*attr);
  4.     int (*dumpit)(struct sk_buff*, struct netlink_callback*cb);
  5. };
  6. /*
   * Process one rtnetlink message.
   *
   * Returns 0 for messages that are silently ignored (non-requests, types
   * below RTM_BASE, messages shorter than a struct rtgenmsg header).
   * Returns -1 with *errp set on errors, and also after a dump request has
   * been started. Otherwise returns the doit handler's result, which is
   * stored in *errp as well.
   */

  7. static __inline__ int
  8. rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int*errp)
  9. {
  10.     struct rtnetlink_link *link;
  11.     struct rtnetlink_link *link_tab;
  12.     int sz_idx, kind;
  13.     int min_len;
  14.     int family;
  15.     int type;
  16.     int err;

  17.     /* Only requests are handled by kernel now */
  18.     if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
  19.         return 0;
  20.     type = nlh->nlmsg_type;
  21.     /* A control message: ignore them*/
  22.     if (type< RTM_BASE)
  23.         return 0;
  24.     /* Unknown message: reply with EINVAL*/
  25.     if (type> RTM_MAX)
  26.         goto err_inval;
  27.     type -= RTM_BASE;
  28.     /* All the messages must have at least 1 byte length*/
  29.     if (nlh->nlmsg_len< NLMSG_LENGTH(sizeof(struct rtgenmsg)))
  30.         return 0;
  31.     family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
  32.     if (family>= NPROTO){
  33.         *errp = -EAFNOSUPPORT;
  34.         return -1;
  35.     }

  36.     link_tab = rtnetlink_links[family];// Select the handler table by the family passed in from user space; for the routing example this would be AF_ROUTE or AF_NETLINK.
  37.     if (link_tab== NULL)
  38.         link_tab = rtnetlink_links[PF_UNSPEC];
  39.     link =&link_tab[type];// Select the handler entry by message type; in this example the type is RTM_NEWROUTE.

      /* The low two bits of the type give the kind (2 is the one allowed to
       * dump); the remaining bits index the rtm_min[] / rta_max[] tables. */
  40.     sz_idx = type>>2;
  41.     kind = type&3;

  42.     if (kind!= 2 && security_netlink_recv(skb)){
  43.         *errp = -EPERM;
  44.         return -1;
  45.     }

  46.     if (kind== 2 && nlh->nlmsg_flags&NLM_F_DUMP){
          /* Fall back to the PF_UNSPEC table when the family has no dumpit. */
  47.         if (link->dumpit== NULL)
  48.             link = &(rtnetlink_links[PF_UNSPEC][type]);

  49.         if (link->dumpit== NULL)
  50.             goto err_inval;

  51.         if ((*errp= netlink_dump_start(rtnl, skb, nlh,
  52.                         link->dumpit,NULL))!= 0){
  53.             return -1;
  54.         }

  55.         netlink_queue_skip(nlh, skb);
  56.         return -1;
  57.     }

      /* Clear the shared attribute table before parsing this message. */
  58.     memset(rta_buf, 0,(rtattr_max * sizeof(struct rtattr*)));

  59.     min_len = rtm_min[sz_idx];
  60.     if (nlh->nlmsg_len< min_len)
  61.         goto err_inval;

  62.     if (nlh->nlmsg_len> min_len) {
  63.         int attrlen = nlh->nlmsg_len- NLMSG_ALIGN(min_len);
  64.         struct rtattr *attr = (void*)nlh+ NLMSG_ALIGN(min_len);

          /* Record a pointer to each attribute at index (rta_type - 1);
           * attributes of type 0 are skipped, out-of-range types are rejected. */
  65.         while (RTA_OK(attr, attrlen)){
  66.             unsigned flavor = attr->rta_type;
  67.             if (flavor) {
  68.                 if (flavor > rta_max[sz_idx])
  69.                     goto err_inval;
  70.                 rta_buf[flavor-1]= attr;
  71.             }
  72.             attr = RTA_NEXT(attr, attrlen);
  73.         }
  74.     }

      /* Fall back to the PF_UNSPEC table when the family has no doit. */
  75.     if (link->doit== NULL)
  76.         link =&(rtnetlink_links[PF_UNSPEC][type]);
  77.     if (link->doit== NULL)
  78.         goto err_inval;
  79.     err =link->doit(skb, nlh,(void *)&rta_buf[0]);// Invokes the handler registered for RTM_NEWROUTE, i.e. the inet6_rtm_newroute function shown below.

  80.     *errp =err;
  81.     return err;

  82. err_inval:
  83.     *errp =-EINVAL;
  84.     return -1;
  85. }
  /*
   * RTM_NEWROUTE handler for IPv6: converts the netlink rtmsg plus the
   * attribute table in arg into a struct in6_rtmsg and passes it to
   * ip6_route_add(). Returns -EINVAL if the conversion fails, otherwise
   * whatever ip6_route_add() returns.
   */
  86. int inet6_rtm_newroute(struct sk_buff*skb, struct nlmsghdr* nlh, void*arg)
  87. {
  88.     struct rtmsg *r = NLMSG_DATA(nlh);
  89.     struct in6_rtmsg rtmsg;

  90.     if (inet6_rtm_to_rtmsg(r, arg,&rtmsg))
  91.         return -EINVAL;
  92.     return ip6_route_add(&rtmsg, nlh, arg,&NETLINK_CB(skb));
  93. }

inet6_rtm_newroute函数通过下面的数组完成了注册,所以上面的link->doit(skb, nlh, (void *)&rta_buf[0])调用的就是该数组中与消息类型对应的处理函数。

  /*
   * IPv6 rtnetlink handler table, indexed by (message type - RTM_BASE).
   * Entries not listed here are zero-initialised, so their doit/dumpit
   * pointers are NULL.
   */
  1. static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES]= {
  2.     [RTM_GETLINK - RTM_BASE] = { .dumpit    = inet6_dump_ifinfo,},
  3.     [RTM_NEWADDR - RTM_BASE] = { .doit    = inet6_rtm_newaddr,},
  4.     [RTM_DELADDR - RTM_BASE] = { .doit    = inet6_rtm_deladdr,},
  5.     [RTM_GETADDR - RTM_BASE] = { .dumpit    = inet6_dump_ifaddr,},
  6.     [RTM_GETMULTICAST - RTM_BASE] = { .dumpit= inet6_dump_ifmcaddr,},
  7.     [RTM_GETANYCAST - RTM_BASE] = { .dumpit    = inet6_dump_ifacaddr,},
  8.     [RTM_NEWROUTE - RTM_BASE] = { .doit    = inet6_rtm_newroute,},
  9.     [RTM_DELROUTE - RTM_BASE] = { .doit    = inet6_rtm_delroute,},
  10.     [RTM_GETROUTE - RTM_BASE] = { .doit    = inet6_rtm_getroute,
  11.                  .dumpit    = inet6_dump_fib,},
  12. };

相关的结构体:

内核中所有的netlink套接字存储在一个全局的哈希表中,该结构定义如下:

static struct netlink_table *nl_table;其中每个协议对应一个哈希表,同一种协议的所有数据报都散列在同一个哈希表中。

下面为一种协议所连接的哈希表结构:

struct netlink_table {

         struct nl_pid_hashhash; // 根据pid进行HASH的netlink sock链表, 相当于客户端链表

         struct hlist_headmc_list; // 多播的sock链表

         unsigned int nl_nonroot;// 监听者标志

         unsigned int groups;// 每个netlink的协议类型可以定义多个组, 8的倍数,最小是32

         struct module *module;

         int registered;

};最大可有MAX_LINKS(32)个表,处理不同协议类型的netlink套接口, 注意由于是自身的通信, 本机

同时作为服务器和客户端, 服务端需要一个套接口对应,每个客户端也要有一个套接口对应, 多个客户端的套接口形成一个链表.

  /* pid-keyed hash of netlink socks, embedded in struct netlink_table as `hash`. */
  1. struct nl_pid_hash {
  2.     struct hlist_head *table;// hash buckets: the protocol's socks are chained into each bucket, and the hash value locates a particular sock
  3.     unsigned long rehash_time; // interval for recomputing the hash

  4.     unsigned int mask;
  5.     unsigned int shift;

  6.     unsigned int entries;// number of entries
  7.     unsigned int max_shift;// maximum shift (power of two)
  8.     u32 rnd;// random value
  9. };

在kernel/include/linux/net.h中:

  /*
   * Table of per-protocol-family socket operations: one function pointer
   * for each socket-level operation, plus the family number and the
   * owning module.
   */
  1. struct proto_ops {
  2.     int        family;
  3.     struct module    *owner;
  4.     int        (*release)(struct socket *sock);
  5.     int        (*bind)    (struct socket *sock,
  6.                  struct sockaddr *myaddr,
  7.                  int sockaddr_len);
  8.     int        (*connect)(struct socket *sock,
  9.                  struct sockaddr *vaddr,
  10.                  int sockaddr_len,int flags);
  11.     int        (*socketpair)(struct socket*sock1,
  12.                  struct socket *sock2);
  13.     int        (*accept)(struct socket *sock,
  14.                  struct socket *newsock,int flags);
  15.     int        (*getname)(struct socket *sock,
  16.                  struct sockaddr *addr,
  17.                  int *sockaddr_len, int peer);
  18.     unsigned int    (*poll)    (struct file *file, struct socket*sock,
  19.                  struct poll_table_struct *wait);
  20.     int        (*ioctl)(struct socket *sock, unsigned int cmd,
  21.                  unsigned long arg);
  22.     int        (*listen)(struct socket *sock,int len);
  23.     int        (*shutdown)(struct socket *sock,int flags);
  24.     int        (*setsockopt)(struct socket*sock, int level,
  25.                  int optname, char __user*optval, int optlen);
  26.     int        (*getsockopt)(struct socket*sock, int level,
  27.                  int optname, char __user*optval, int __user *optlen);
  28.     int        (*sendmsg)(struct kiocb *iocb, struct socket*sock,// sendmsg/recvmsg are the functions that actually send and receive for a netlink socket
  29.                  struct msghdr *m, size_t total_len);
  30.     int        (*recvmsg)(struct kiocb *iocb, struct socket*sock,
  31.                  struct msghdr *m, size_t total_len,
  32.                  int flags);
  33.     int        (*mmap)    (struct file *file, struct socket*sock,
  34.                  struct vm_area_struct * vma);
  35.     ssize_t        (*sendpage)(struct socket *sock, struct page*page,
  36.                  int offset, size_t size,int flags);
  37. };

下面我们看看,当我们使用route命令添加一条新的路由时,函数的调用顺序是怎么样的。下面是主要的函数:

Dput()

sys_sendmsg()//内核的接收函数

new_inode()

netlink_sendmsg//内核态接收用户态发送的数据

rtnetlink_rcv()

netlink_run_queue()

rtnetlink_rcv_msg()

inet6_rtm_newroute()

在kernel/net/netlink/af_netlink.c文件中,内核态接收用户态发送的数据,在netlink_sendskb函数中调用sock的队列,执行相应的netlink接收函数

  /*
   * sendmsg handler for netlink sockets.
   *
   * Resolves the destination pid/group (from msg->msg_name if present,
   * otherwise from the socket's stored destination), autobinds the sender
   * if it has no pid yet, copies the user payload into a freshly allocated
   * sk_buff, and delivers it by broadcast (when a group is set) and unicast.
   *
   * Returns the result of netlink_unicast() on the delivery path, or a
   * negative errno: -EOPNOTSUPP for MSG_OOB, -EINVAL for a non-AF_NETLINK
   * address, -EPERM when sending to a group is not permitted, -EMSGSIZE when
   * len exceeds sk_sndbuf - 32, -ENOBUFS on allocation failure, -EFAULT when
   * the payload copy fails, or the error from scm_send(),
   * netlink_autobind() or security_netlink_send().
   */
  1. static int netlink_sendmsg(struct kiocb*kiocb, struct socket*sock,
  2.              struct msghdr *msg, size_t len)
  3. {
  4.     struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
  5.     struct sock *sk = sock->sk;
  6.     struct netlink_sock *nlk = nlk_sk(sk);
  7.     struct sockaddr_nl *addr=msg->msg_name;
  8.     u32 dst_pid;
  9.     u32 dst_group;
  10.     struct sk_buff *skb;
  11.     int err;
  12.     struct scm_cookie scm;

  13.     if (msg->msg_flags&MSG_OOB)
  14.         return -EOPNOTSUPP;

  15.     if (NULL== siocb->scm)
  16.         siocb->scm= &scm;
  17.     err = scm_send(sock, msg, siocb->scm);
  18.     if (err< 0)
  19.         return err;

  20.     if (msg->msg_namelen){
  21.         if (addr->nl_family!= AF_NETLINK)
  22.             return -EINVAL;
  23.         dst_pid = addr->nl_pid;
  24.         dst_group = ffs(addr->nl_groups);
  25.         if (dst_group&& !netlink_capable(sock, NL_NONROOT_SEND))
  26.             return -EPERM;
  27.     } else{
  28.         dst_pid = nlk->dst_pid;
  29.         dst_group = nlk->dst_group;
  30.     }

  31.     if (!nlk->pid){
  32.         err = netlink_autobind(sock);
  33.         if (err)
  34.             goto out;
  35.     }

  36.     err =-EMSGSIZE;
  37.     if (len> sk->sk_sndbuf- 32)
  38.         goto out;
  39.     err =-ENOBUFS;
  40.     skb = alloc_skb(len, GFP_KERNEL);// Allocate an sk_buff; the msghdr payload is converted into it.
  41.     if (skb==NULL)
  42.         goto out;

  43.     NETLINK_CB(skb).pid    = nlk->pid;// Record the sender's own pid.
  44.     NETLINK_CB(skb).dst_pid= dst_pid;
  45.     NETLINK_CB(skb).dst_group= dst_group;
  46.     NETLINK_CB(skb).loginuid= audit_get_loginuid(current->audit_context);
  47.     memcpy(NETLINK_CREDS(skb),&siocb->scm->creds, sizeof(struct ucred));

  48.     /* What can I do? Netlink is asynchronous, so that
  49.      we will have to save current capabilities to
  50.      check them, when this message will be delivered
  51.      to corresponding kernel module.--ANK (980802)
  52.      */

  53.     err =-EFAULT;
  54. // Copy the data into the sk_buff.
  55.     if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov,len)){
  56.         kfree_skb(skb);
  57.         goto out;
  58.     }

  59.     err = security_netlink_send(sk, skb);
  60.     if (err){
  61.         kfree_skb(skb);
  62.         goto out;
  63.     }

  64.     if (dst_group){
          /* Take an extra reference so the skb is still usable for the unicast below. */
  65.         atomic_inc(&skb->users);
  66.         netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
  67.     }
  68.     err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);

  69. out:
  70.     return err;
  71. }
  71. }