- static void rtnetlink_rcv(struct sock*sk, int len)
-
{
- unsigned int qlen = 0;
- do {
- rtnl_lock();
- netlink_run_queue(sk,&qlen, &rtnetlink_rcv_msg);
- up(&rtnl_sem);
- netdev_run_todo();
- } while(qlen);
- }
上面的内核函数就是用来接收用户空间发来的路由相关Netlink消息的,当我们使用route命令添加一条路由时,就会调用该函数进行接收。该函数是在netlink初始化时注册的,注册代码同样位于rtnetlink.c文件中。
- void __init rtnetlink_init(void)
-
{
- int i;
- rtattr_max = 0;
- for (i= 0; i < ARRAY_SIZE(rta_max); i++)
- if (rta_max[i]> rtattr_max)
- rtattr_max = rta_max[i];
- rta_buf = kmalloc(rtattr_max* sizeof(struct rtattr*), GFP_KERNEL);
- if (!rta_buf)
- panic("rtnetlink_init: cannot allocate rta_buf\n");
- rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
- THIS_MODULE);//在创建内核的netlink时,注册了路由netlink的接收函数,rtnetlink_rcv.
- if (rtnl== NULL)
- panic("rtnetlink_init: cannot initialize rtnetlink\n");
- netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
- register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnetlink_links[PF_UNSPEC]= link_rtnetlink_table;
- rtnetlink_links[PF_PACKET]= link_rtnetlink_table;
- }
在netlink_kernel_create函数中,可以看到内核是如何设置用于接收用户空间消息的接收函数的:
- struct sock *
- netlink_kernel_create(int unit, unsignedint groups,
- void (*input)(struct sock*sk, int len),
- struct module *module)
-
{
- struct socket *sock;
- struct sock *sk;
- struct netlink_sock *nlk;
- if (!nl_table)
- return NULL;
- if (unit<0|| unit>=MAX_LINKS)
- return NULL;
- if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit,&sock))
- return NULL;
- if (__netlink_create(sock, unit)< 0)
- goto out_sock_release;
- sk = sock->sk;
- sk->sk_data_ready= netlink_data_ready;
- if (input)
- nlk_sk(sk)->data_ready= input;//设置内核接收Netlink消息的函数,这里就是前面的rtnetlink_rcv函数
- if (netlink_insert(sk, 0))
- goto out_sock_release;
- nlk = nlk_sk(sk);//取得sock嵌入的netlink_sock结构体
- nlk->flags|= NETLINK_KERNEL_SOCKET;
- netlink_table_grab();
- nl_table[unit].groups= groups < 32? 32 : groups;
- nl_table[unit].module= module;
- nl_table[unit].registered= 1;// 更新netlink_table结构体信息,每中协议对应一个netlink_
- table结构
- netlink_table_ungrab();
- return sk;
- out_sock_release:
- sock_release(sock);
- return NULL;
- }
到此,从内核创建netlink套接字到接收用户空间发送过来的消息,整个流程就清晰了。当我们添加一条新路由时,接收函数rtnetlink_rcv会在循环中从接收队列里取出消息,并调用实际的接收处理函数,这里为rtnetlink_rcv_msg函数。
- /**
- * nelink_run_queue - Process netlink receive queue.
- * @sk: Netlink socket containing the queue
- * @qlen: Placeto store queue length upon entry
- * @cb: Callbackfunction invoked foreach netlink message found
- *
- * Processes as much as there was in the queue upon entry and invokes
- * a callback function for each netlink message found. The callback
- * function may refuse a message by returning a negativeerror code
- * but setting the error pointer to 0 in which case this function
- * returns with a qlen != 0.
- *
- * qlen must be initialized to 0 before the initial entry, afterwards
- * the function may be called repeatedlyuntil qlen reaches 0.
- */
- void netlink_run_queue(struct sock *sk, unsigned int *qlen,
- int (*cb)(struct sk_buff*, struct nlmsghdr*, int *))
-
{
- struct sk_buff *skb;
- if (!*qlen|| *qlen > skb_queue_len(&sk->sk_receive_queue))
- *qlen = skb_queue_len(&sk->sk_receive_queue);
- for (;*qlen; (*qlen)--){
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (netlink_rcv_skb(skb, cb)){
- if (skb->len)
- skb_queue_head(&sk->sk_receive_queue, skb);
- else {
- kfree_skb(skb);
- (*qlen)--;
- }
- break;
- }
- kfree_skb(skb);
- }
- }
下面是rtnetlink_rcv_msg()函数的实现,它对netlink消息进行相应的处理。其中用到一个数据结构
struct rtnetlink_link *link; 其定义如下,包含两个不同的处理函数指针:
/*
 * Per-message-type handler pair used by rtnetlink_rcv_msg().
 * Either pointer may be NULL when no handler is registered.
 */
struct rtnetlink_link {
	/* Handles a single request; receives the parsed attribute array. */
	int (*doit)(struct sk_buff *, struct nlmsghdr *, void *attr);
	/* Dump handler, passed to netlink_dump_start() for NLM_F_DUMP requests. */
	int (*dumpit)(struct sk_buff *, struct netlink_callback *cb);
};
-
/*
 * Process one rtnetlink message.
 *
 * skb:  buffer the message was received in
 * nlh:  header of the message to handle
 * errp: receives the error code whenever the function returns non-zero
 *
 * Returns 0 when the message is silently ignored (not a request, a control
 * message below RTM_BASE, or too short to hold a struct rtgenmsg).
 * Returns -1 with *errp set to -EINVAL (unknown type, payload shorter than
 * the type's minimum, attribute type out of range, no handler),
 * -EAFNOSUPPORT (family >= NPROTO) or -EPERM (security_netlink_recv()
 * refused a message whose kind is not 2).
 * For a kind-2 message with NLM_F_DUMP set, the dump is started through
 * netlink_dump_start(), its result is stored in *errp and -1 is returned
 * in either case.
 * Otherwise the attributes are collected into rta_buf, the doit handler is
 * called, and its result is both stored in *errp and returned.
 */
- static __inline__ int
- rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int*errp)
-
{
- struct rtnetlink_link *link;
- struct rtnetlink_link *link_tab;
- int sz_idx, kind;
- int min_len;
- int family;
- int type;
- int err;
- /* Only requests are handled by the kernel now */
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
- type = nlh->nlmsg_type;
- /* A control message: ignore them */
- if (type< RTM_BASE)
- return 0;
- /* Unknown message: reply with EINVAL */
- if (type> RTM_MAX)
- goto err_inval;
- type -= RTM_BASE;
- /* Every message must be long enough to hold a struct rtgenmsg */
- if (nlh->nlmsg_len< NLMSG_LENGTH(sizeof(struct rtgenmsg)))
- return 0;
- family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
- if (family>= NPROTO){
- *errp = -EAFNOSUPPORT;
- return -1;
- }
- link_tab = rtnetlink_links[family];/* Pick the handler table by the family sent from user space. NOTE(review): the original note says this is AF_ROUTE or AF_NETLINK for routing, while the walkthrough ends in the inet6 table -- confirm. */
- if (link_tab== NULL)
- link_tab = rtnetlink_links[PF_UNSPEC];
- link =&link_tab[type];/* Pick the handler entry by message type; in this walkthrough the type is RTM_NEWROUTE. */
- sz_idx = type>>2; /* index into rtm_min[] and rta_max[] */
- kind = type&3; /* presumably 2 marks the GET variant of a message group -- TODO confirm against the RTM_* numbering */
- if (kind!= 2 && security_netlink_recv(skb)){
- *errp = -EPERM;
- return -1;
- }
- if (kind== 2 && nlh->nlmsg_flags&NLM_F_DUMP){
- if (link->dumpit== NULL)
- link = &(rtnetlink_links[PF_UNSPEC][type]);
- if (link->dumpit== NULL)
- goto err_inval;
- if ((*errp= netlink_dump_start(rtnl, skb, nlh,
- link->dumpit,NULL))!= 0){
- return -1;
- }
- netlink_queue_skip(nlh, skb);
- return -1;
- }
- memset(rta_buf, 0,(rtattr_max * sizeof(struct rtattr*)));
- min_len = rtm_min[sz_idx];
- if (nlh->nlmsg_len< min_len)
- goto err_inval;
- if (nlh->nlmsg_len> min_len) {
- int attrlen = nlh->nlmsg_len- NLMSG_ALIGN(min_len);
- struct rtattr *attr = (void*)nlh+ NLMSG_ALIGN(min_len);
- while (RTA_OK(attr, attrlen)){
- unsigned flavor = attr->rta_type;
- if (flavor) {
- if (flavor > rta_max[sz_idx])
- goto err_inval;
- rta_buf[flavor-1]= attr; /* slot index is attribute type minus one */
- }
- attr = RTA_NEXT(attr, attrlen);
- }
- }
- if (link->doit== NULL)
- link =&(rtnetlink_links[PF_UNSPEC][type]);
- if (link->doit== NULL)
- goto err_inval;
- err =link->doit(skb, nlh,(void *)&rta_buf[0]);/* For RTM_NEWROUTE this calls the registered route handler, i.e. the inet6_rtm_newroute function shown below. */
- *errp =err;
- return err;
- err_inval:
- *errp =-EINVAL;
- return -1;
-
}
-
int inet6_rtm_newroute(struct sk_buff*skb, struct nlmsghdr* nlh, void*arg)
-
{
- struct rtmsg *r = NLMSG_DATA(nlh);
- struct in6_rtmsg rtmsg;
- if (inet6_rtm_to_rtmsg(r, arg,&rtmsg))
- return -EINVAL;
- return ip6_route_add(&rtmsg, nlh, arg,&NETLINK_CB(skb));
- }
inet6_rtm_newroute函数通过下面的数组进行了注册,所以上面的link->doit(skb, nlh, (void *)&rta_buf[0])就是根据下面这张表分发调用的。
- static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES]= {
- [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo,},
- [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr,},
- [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr,},
- [RTM_GETADDR - RTM_BASE] = { .dumpit = inet6_dump_ifaddr,},
- [RTM_GETMULTICAST - RTM_BASE] = { .dumpit= inet6_dump_ifmcaddr,},
- [RTM_GETANYCAST - RTM_BASE] = { .dumpit = inet6_dump_ifacaddr,},
- [RTM_NEWROUTE - RTM_BASE] = { .doit = inet6_rtm_newroute,},
- [RTM_DELROUTE - RTM_BASE] = { .doit = inet6_rtm_delroute,},
- [RTM_GETROUTE - RTM_BASE] = { .doit = inet6_rtm_getroute,
- .dumpit = inet6_dump_fib,},
- };
相关的结构体:
内核中所有的netlink套接字存储在一个全局的哈希表中,该结构定义如下:
static struct netlink_table *nl_table;
其中每个协议对应一个哈希表,同一种协议的所有数据报都散列在同一个哈希表中。
下面为一种协议所连接的哈希表结构:struct netlink_table {
struct nl_pid_hash hash; // 根据pid进行HASH的netlink sock链表, 相当于客户端链表
struct hlist_head mc_list; // 多播的sock链表
unsigned int nl_nonroot;// 监听者标志
unsigned int groups;// 每个netlink的协议类型可以定义多个组, 8的倍数,最小是32
struct module *module;
int registered;
};最大可有MAX_LINKS(32)个表,处理不同协议类型的netlink套接口, 注意由于是自身的通信, 本机
同时作为服务器和客户端, 服务端需要一个套接口对应,每个客户端也要有一个套接口对应, 多个客户端的套接口形成一个链表.
- struct hlist_head *table;// 链表节点,每个桶中协议的sock连入其中,根据哈希值可以确定对应的sock
- unsigned long rehash_time; // 重新计算HASH的时间间隔
- unsigned int mask;
- unsigned int shift;
- unsigned int entries;// 链表节点数
- unsigned int max_shift;// 最大幂值
- u32 rnd;// 随机数
- };
在kernel/include/linux/net.h中:
- struct proto_ops {
- int family;
- struct module *owner;
- int (*release)(struct socket *sock);
- int (*bind) (struct socket *sock,
- struct sockaddr *myaddr,
- int sockaddr_len);
- int (*connect)(struct socket *sock,
- struct sockaddr *vaddr,
- int sockaddr_len,int flags);
- int (*socketpair)(struct socket*sock1,
- struct socket *sock2);
- int (*accept)(struct socket *sock,
- struct socket *newsock,int flags);
- int (*getname)(struct socket *sock,
- struct sockaddr *addr,
- int *sockaddr_len, int peer);
- unsigned int (*poll) (struct file *file, struct socket*sock,
- struct poll_table_struct *wait);
- int (*ioctl)(struct socket *sock, unsignedint cmd,
- unsigned long arg);
- int (*listen)(struct socket *sock,int len);
- int (*shutdown)(struct socket *sock,int flags);
- int (*setsockopt)(struct socket*sock, int level,
- int optname, char __user*optval, int optlen);
- int (*getsockopt)(struct socket*sock, int level,
- int optname, char __user*optval, int __user *optlen);
- int (*sendmsg)(struct kiocb *iocb, struct socket*sock,//netlink套接字实际的发送与接收函数
- struct msghdr *m, size_t total_len);
- int (*recvmsg)(struct kiocb *iocb, struct socket*sock,
- struct msghdr *m, size_t total_len,
- int flags);
- int (*mmap) (struct file *file, struct socket*sock,
- struct vm_area_struct * vma);
- ssize_t (*sendpage)(struct socket *sock, struct page*page,
- int offset, size_t size,int flags);
- };
下面我们看看,当我们使用route命令添加一条新的路由时,这些函数的调用顺序是怎么样的。下面是主要的函数:
Dput()
sys_sendmsg()//内核的接受函数
new_inode()
netlink_sendmsg//内核态接收用户态发送的数据
rtnetlink_rcv()
netlink_run_queue()
rtnetlink_rcv_msg()
inet6_rtm_newroute()
在kernel/net/netlink/af_netlink.c文件中,内核态接收用户态发送的数据,在netlink_sendskb函数中调用sock的队列,执行相应的netlink接收函数
- static int netlink_sendmsg(struct kiocb*kiocb, struct socket*sock,
- struct msghdr *msg, size_tlen)
-
{
- struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
- struct sock *sk = sock->sk;
- struct netlink_sock *nlk = nlk_sk(sk);
- struct sockaddr_nl *addr=msg->msg_name;
- u32 dst_pid;
- u32 dst_group;
- struct sk_buff *skb;
- int err;
- struct scm_cookie scm;
- if (msg->msg_flags&MSG_OOB)
- return -EOPNOTSUPP;
- if (NULL== siocb->scm)
- siocb->scm= &scm;
- err = scm_send(sock, msg, siocb->scm);
- if (err< 0)
- return err;
- if (msg->msg_namelen){
- if (addr->nl_family!= AF_NETLINK)
- return -EINVAL;
- dst_pid = addr->nl_pid;
- dst_group = ffs(addr->nl_groups);
- if (dst_group&& !netlink_capable(sock, NL_NONROOT_SEND))
- return -EPERM;
- } else{
- dst_pid = nlk->dst_pid;
- dst_group = nlk->dst_group;
- }
- if (!nlk->pid){
- err = netlink_autobind(sock);
- if (err)
- goto out;
- }
- err =-EMSGSIZE;
- if (len> sk->sk_sndbuf- 32)
- goto out;
- err =-ENOBUFS;
- skb = alloc_skb(len, GFP_KERNEL);// 分配一个sk_buff结构,将msghdr结构转化为sk_buff结构
- if (skb==NULL)
- goto out;
- NETLINK_CB(skb).pid = nlk->pid;//填写本地的pid信息
- NETLINK_CB(skb).dst_pid= dst_pid;
- NETLINK_CB(skb).dst_group= dst_group;
- NETLINK_CB(skb).loginuid= audit_get_loginuid(current->audit_context);
- memcpy(NETLINK_CREDS(skb),&siocb->scm->creds, sizeof(struct ucred));
- /* What can Ido? Netlinkis asynchronous, so that
- we will have to save current capabilitiesto
- check them, when this message will be delivered
- to corresponding kernel module.--ANK (980802)
- */
- err =-EFAULT;
-
//数据拷贝进sk_buff中
- if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov,len)){
- kfree_skb(skb);
- goto out;
- }
- err = security_netlink_send(sk, skb);
- if (err){
- kfree_skb(skb);
- goto out;
- }
- if (dst_group){
- atomic_inc(&skb->users);
- netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
- }
- err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
- out:
- return err;
- }