Linux虚拟网卡TUN/TAP

时间:2022-11-27 08:19:43

Linux虚拟网卡TUN/TAP



TUN/TAP 提供了给用户空间程序的包的接收和传输,它可以看成是简单的点对点设备或是
以太网设备。它不是从物理设备接收包,而是从用户空间程序接收包。它发送包不是通过物
理设备来发送包,而是将这些包写入用户空间程序来发送。
为了使用这个驱动,应用程序需要打开/dev/net/tun 设备(字符设备),然后发出一个控
制调用(ioctl)来注册一个网卡设备。该网络设备将被命名为tunXX 或tapXX,具体取决于你所设定的标志
位。当应用程序关闭文件描述符的时候,网络设备和其他相关的路由将会消失。
依赖于所选择的设备类型,用户空间的应用程序需要读写IP 包(用tun 设备)或以太网包(用
tap 设备).至于具体用那种设备,依赖于传递给ioctl 函数的标志参数.
Tun/tap 设备的源码包地址是http://vtun.sourceforge.net/tun

包含两个简单的例子,用于显示如何使用tun 设备和tap 设备。两个程序就像是这两个网
络设备接口间的网桥。
br_select.c ‐ bridge based on select system call.
br_sigio.c ‐ bridge based on async io and SIGIO signal.
当然,最好的例子是VTun http://vtun.sourceforge.net :))

/* Name tun_init as the module's initialisation function. */
module_init(tun_init);
/* Name tun_cleanup as the module's exit function. */
module_exit(tun_cleanup);
/* Network device part of the driver */
/* List head for the driver's devices (presumably linked through tun_struct.list -- confirm). */
static LIST_HEAD(tun_dev_list);
/* ethtool operations table of the driver; its initialiser is not part of this excerpt. */
static const struct ethtool_ops tun_ethtool_ops;

主要的数据结构
struct miscdevice
/*
 * Descriptor of a "misc" character device, as consumed by misc_register().
 */
struct miscdevice {
/* Minor number; MISC_DYNAMIC_MINOR asks misc_register() to pick a free one. */
int minor;
/* Device name, handed to device_create() by misc_register(). */
const char *name;
/* File operations implementing the character device. */
const struct file_operations *fops;
/* Link used by misc_register() to put the device on misc_list. */
struct list_head list;
/* Parent device, passed through to device_create(). */
struct device *parent;
/* Device returned by device_create() during registration. */
struct device *this_device;
};
struct tun_struct
struct tun_struct {
struct list_head list;
unsigned long flags;// //区分tun 和tap 设备

int attached;
uid_t owner;
wait_queue_head_t read_wait;// //等待队列

struct sk_buff_head readq; // //网络缓冲区队列

struct net_device *dev; // //linux 抽象网络设备结构(结构是linux 内核提供的

统一网络设备结构,定义了系统统一的访问接口。)
struct net_device_stats stats; // //网卡状态信息结构

struct fasync_struct *fasync;// //文件异步通知结构

unsigned long if_flags;
u8 dev_addr[ETH_ALEN];
u32 chr_filter[2];
u32 net_filter[2];
#ifdef TUN_DEBUG
int debug;
#endif
};
Struct ifreq
/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
* remainder may be interface specific.
*/
struct ifreq
{
#define IFHWADDRLEN 6
/* First member: name of the interface the request refers to. */
union
{
char ifrn_name[IFNAMSIZ]; /* if name, e.g. "en0" */
} ifr_ifrn;
/* Second member: request-specific payload; one alternative per kind of request. */
union {
struct sockaddr ifru_addr;
struct sockaddr ifru_dstaddr;
struct sockaddr ifru_broadaddr;
struct sockaddr ifru_netmask;
struct sockaddr ifru_hwaddr;
short ifru_flags;
int ifru_ivalue;
int ifru_mtu;
struct ifmap ifru_map;
char ifru_slave[IFNAMSIZ]; /* Just fits the size */
char ifru_newname[IFNAMSIZ];
/* Pointer into user space (note the __user annotation). */
void __user * ifru_data;
struct if_settings ifru_settings;
} ifr_ifru;
};
模块的初始化(tun_init)
/*
 * tun_init - module initialisation.
 *
 * Prints the driver banner and registers the misc character device
 * described by tun_miscdev.
 *
 * Returns 0 on success, otherwise the error code from misc_register().
 */
static int __init tun_init(void)
{
	int ret = 0;

	printk(KERN_INFO "tun: %s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
	printk(KERN_INFO "tun: %s\n", DRV_COPYRIGHT);

	ret = misc_register(&tun_miscdev);
	if (ret)
		printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
	return ret;
}
/*
 * Misc device descriptor registered by tun_init(): fixed minor TUN_MINOR,
 * name "tun", served by the tun_fops file operations.
 */
static struct miscdevice tun_miscdev = {
.minor = TUN_MINOR,
.name = "tun",
.fops = &tun_fops,
};
/*
 * File operations of the TUN/TAP character device.
 */
static const struct file_operations tun_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
/* Synchronous read/write use the generic do_sync_* helpers (presumably forwarding to the aio handlers below -- confirm). */
.read = do_sync_read,
.aio_read = tun_chr_aio_read,
.write = do_sync_write,
.aio_write = tun_chr_aio_write,
.poll = tun_chr_poll,
/* Control interface; TUNSETIFF is what attaches a network interface. */
.ioctl = tun_chr_ioctl,
.open = tun_chr_open,
.release = tun_chr_close,
.fasync = tun_chr_fasync
};
misc_register
//在内核中利用misc_register() 函数将该驱动注册为非标准字符设备驱动,提供字符设备具有的各种程序接口。
int misc_register(struct miscdevice * misc)
{
struct miscdevice *c;
dev_t dev;
int err = 0;
INIT_LIST_HEAD(&misc‐>list);
mutex_lock(&misc_mtx);
list_for_each_entry(c, &misc_list, list) {
if (c‐>minor == misc‐>minor) {
mutex_unlock(&misc_mtx);
return ‐EBUSY;
}
}
if (misc‐>minor == MISC_DYNAMIC_MINOR) {
int i = DYNAMIC_MINORS;
while (‐‐i >= 0)
if ( (misc_minors[i>>3] & (1 << (i&7))) == 0)
break;
if (i<0) {
mutex_unlock(&misc_mtx);
return ‐EBUSY;
}
misc‐>minor = i;
}
if (misc‐>minor < DYNAMIC_MINORS)
misc_minors[misc‐>minor >> 3] |= 1 << (misc‐>minor & 7);
dev = MKDEV(MISC_MAJOR, misc‐>minor);
misc‐>this_device = device_create(misc_class, misc‐>parent, dev,
"%s", misc‐>name);
if (IS_ERR(misc‐>this_device)) {
err = PTR_ERR(misc‐>this_device);
goto out;
}
/*
* Add it to the front, so that later devices can "override"
* earlier defaults
*/
list_add(&misc‐>list, &misc_list);
out:
mutex_unlock(&misc_mtx);
return err;
}
tun 设备的操作(系统调用)
tun_chr_open(打开设备时调用)
当打开一个tun/tap 设备时,open 函数将调用tun_chr_open()函数。在下面列出的这个版本中,该函数本身只是把
file->private_data 置为NULL(表示此时还没有关联任何网络设备);
原文提到的网络缓冲区链表的初始化和等待队列的初始化并不在这个函数里完成。
static int tun_chr_open(struct inode *inode, struct file * file)
{
DBG1(KERN_INFO "tunX: tun_chr_open/n");
file‐>private_data = NULL;//初始化设备文件的内容

return 0;
}
tun_chr_ioctl(设备的控制调用接口)
控制调用接口:
Cmd=
.. TUNSETIFF
.. _IOC_TYPE(cmd) == 0x89
.. TUNSETNOCSUM
.. TUNSETPERSIST
.. TUNSETOWNER
.. TUNSETLINK
.. TUNSETDEBUG
.. SIOCGIFFLAGS
.. SIOCSIFFLAGS
.. SIOCGIFHWADDR
.. SIOCSIFHWADDR
.. SIOCADDMULTI
.. SIOCDELMULTI
Tun/tap 驱动中网卡的注册被嵌入了字符驱动的ioctl 例程中,它是通过对字符设备文件描述符利用自
定义的ioctl 设置标志 TUNSETIFF 完成网卡的注册的。
static int tun_chr_ioctl(struct inode *inode, struct file *file,unsigned int cmd, unsigned long arg)
{
struct tun_struct *tun = file‐>private_data;
void __user* argp = (void __user*)arg;
struct ifreq ifr;
if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
if (copy_from_user(&ifr, argp, sizeof ifr))//拷贝用户区的网络设备配置。在用户区已

经分配了ifreq 结构的值和配置值,
return ‐EFAULT;
if (cmd == TUNSETIFF && !tun) {//字符设备文件的数据不是空的则

int err;
ifr.ifr_name[IFNAMSIZ‐] = '/0';
rtnl_lock();//在<linux/rlnetlink.h>中定义

err = tun_set_iff(file, &ifr);
rtnl_unlock();
if (err)
return err;
if (copy_to_user(argp, &ifr, sizeof(ifr)))//把配置数据拷贝到用户区

return ‐EFAULT;
return 0;
}
if (!tun)//tun 设备错误

return ‐EBADFD;
DBG(KERN_INFO "%s: tun_chr_ioctl cmd %d/n", tun‐>dev‐>name, cmd);
switch (cmd) {
case TUNSETNOCSUM:
/* Disable/Enable checksum */
if (arg)
tun‐>flags |= TUN_NOCHECKSUM;
else
tun‐>flags &= ~TUN_NOCHECKSUM;
DBG(KERN_INFO "%s: checksum %s/n",
tun‐>dev‐>name, arg ? "disabled" : "enabled");
break;
case TUNSETPERSIST:
/* Disable/Enable persist mode */
if (arg)
tun‐>flags |= TUN_PERSIST;
else
tun‐>flags &= ~TUN_PERSIST;
DBG(KERN_INFO "%s: persist %s/n",
tun‐>dev‐>name, arg ? "disabled" : "enabled");
break;
case TUNSETOWNER:
/* Set owner of the device */
tun‐>owner = (uid_t) arg;
DBG(KERN_INFO "%s: owner set to %d/n", tun‐>dev‐>name, tun‐>owner);
break;
case TUNSETLINK:
/* Only allow setting the type when the interface is down */
if (tun‐>dev‐>flags & IFF_UP) {
DBG(KERN_INFO "%s: Linktype set failed because interface is up/n",
tun‐>dev‐>name);
return ‐EBUSY;
} else {
tun‐>dev‐>type = (int) arg;
DBG(KERN_INFO "%s: linktype set to %d/n", tun‐>dev‐>name, tun‐>dev‐>type);
}
break;
#ifdef TUN_DEBUG
case TUNSETDEBUG:
tun‐>debug = arg;
break;
#endif
case SIOCGIFFLAGS:
ifr.ifr_flags = tun‐>if_flags;
if (copy_to_user( argp, &ifr, sizeof ifr))
return ‐EFAULT;
return 0;
case SIOCSIFFLAGS:
/** Set the character device's interface flags. Currently only
* IFF_PROMISC and IFF_ALLMULTI are used. */
tun‐>if_flags = ifr.ifr_flags;
DBG(KERN_INFO "%s: interface flags 0x%lx/n",
tun‐>dev‐>name, tun‐>if_flags);
return 0;
case SIOCGIFHWADDR:
/* Note: the actual net device's address may be different */
memcpy(ifr.ifr_hwaddr.sa_data, tun‐>dev_addr,
min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun‐>dev_addr));
if (copy_to_user( argp, &ifr, sizeof ifr))
return ‐EFAULT;
return 0;
case SIOCSIFHWADDR:
{
/* try to set the actual net device's hw address */
int ret = dev_set_mac_address(tun‐>dev, &ifr.ifr_hwaddr);
if (ret == 0) {
/** Set the character device's hardware address. This is used when
* filtering packets being sent from the network device to the character
* device. */
memcpy(tun‐>dev_addr, ifr.ifr_hwaddr.sa_data,
min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun‐>dev_addr));
DBG(KERN_DEBUG "%s: set hardware address: %x:%x:%x:%x:%x:%x/n",
tun‐>dev‐>name,
tun‐>dev_addr[0], tun‐>dev_addr[1], tun‐>dev_addr[2],
tun‐>dev_addr[3], tun‐>dev_addr[4], tun‐>dev_addr[5]);
}
return ret;
}
case SIOCADDMULTI:
/** Add the specified group to the character device's multicast filter
* list. */
add_multi(tun‐>chr_filter, ifr.ifr_hwaddr.sa_data);
DBG(KERN_DEBUG "%s: add multi: %x:%x:%x:%x:%x:%x/n",
tun‐>dev‐>name,
(u8)ifr.ifr_hwaddr.sa_data[0], (u8)ifr.ifr_hwaddr.sa_data[1],
(u8)ifr.ifr_hwaddr.sa_data[2], (u8)ifr.ifr_hwaddr.sa_data[3],
(u8)ifr.ifr_hwaddr.sa_data[4], (u8)ifr.ifr_hwaddr.sa_data[5]);
return 0;
case SIOCDELMULTI:
/** Remove the specified group from the character device's multicast
* filter list. */
del_multi(tun‐>chr_filter, ifr.ifr_hwaddr.sa_data);
DBG(KERN_DEBUG "%s: del multi: %x:%x:%x:%x:%x:%x/n",
tun‐>dev‐>name,
(u8)ifr.ifr_hwaddr.sa_data[0], (u8)ifr.ifr_hwaddr.sa_data[1],
(u8)ifr.ifr_hwaddr.sa_data[2], (u8)ifr.ifr_hwaddr.sa_data[3],
(u8)ifr.ifr_hwaddr.sa_data[4], (u8)ifr.ifr_hwaddr.sa_data[5]);
return 0;
default:
return ‐EINVAL;
};
return 0;
}
tun_chr_aio_read(异步读)(从tun 设备中读取数据)
/*
 * tun_chr_aio_read - read one packet from the TUN/TAP character device.
 * @iocb:  I/O control block; iocb->ki_filp is the open device file
 * @iv:    destination iovec array in user space
 * @count: number of entries in @iv
 * @pos:   file position (unused)
 *
 * Dequeues packets from tun->readq until one passes the address filter and
 * copies that packet to user space with tun_put_user(). Rejected packets are
 * freed and the loop continues. If the queue is empty the caller sleeps on
 * tun->read_wait, unless the file is O_NONBLOCK or a signal is pending.
 *
 * Returns the value of tun_put_user() for the accepted packet, 0 if the
 * total iovec length is 0, -EBADFD if no device is attached, -EINVAL if the
 * iovec length is negative, -EAGAIN for a non-blocking read on an empty
 * queue, or -ERESTARTSYS when interrupted by a signal.
 */
static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
			    unsigned long count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct tun_struct *tun = file->private_data;
	DECLARE_WAITQUEUE(wait, current);
	struct sk_buff *skb;
	ssize_t len, ret = 0;

	if (!tun)
		return -EBADFD;

	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);

	len = iov_total(iv, count);
	if (len < 0)
		return -EINVAL;

	add_wait_queue(&tun->read_wait, &wait);
	while (len) {
		const u8 ones[ ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
		u8 addr[ ETH_ALEN];
		int bit_nr;

		/* Mark ourselves sleeping before checking the queue. */
		current->state = TASK_INTERRUPTIBLE;

		/* Read frames from the queue */
		if (!(skb=skb_dequeue(&tun->readq))) {
			if (file->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}

			/* Nothing to read, let's sleep */
			schedule();
			continue;
		}
		netif_wake_queue(tun->dev);

		/** Decide whether to accept this packet. This code is designed to
		 * behave identically to an Ethernet interface. Accept the packet if
		 * - we are promiscuous.
		 * - the packet is addressed to us.
		 * - the packet is broadcast.
		 * - the packet is multicast and
		 *   - we are multicast promiscous.
		 *   - we belong to the multicast group.
		 */
		skb_copy_from_linear_data(skb, addr, min_t(size_t, sizeof addr,
							   skb->len));
		/* The top 6 bits of the CRC select one of the 64 filter bits. */
		bit_nr = ether_crc(sizeof addr, addr) >> 26;
		if ((tun->if_flags & IFF_PROMISC) ||
				memcmp(addr, tun->dev_addr, sizeof addr) == 0 ||
				memcmp(addr, ones, sizeof addr) == 0 ||
				(((addr[0] == 1 && addr[1] == 0 && addr[2] == 0x5e) ||
				  (addr[0] == 0x33 && addr[1] == 0x33)) &&
				 ((tun->if_flags & IFF_ALLMULTI) ||
				  (tun->chr_filter[bit_nr >> 5] & (1 << (bit_nr & 31)))))) {
			DBG(KERN_DEBUG "%s: tun_chr_readv: accepted: %x:%x:%x:%x:%x:%x\n",
					tun->dev->name, addr[0], addr[1], addr[2],
					addr[3], addr[4], addr[5]);
			ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
			kfree_skb(skb);
			break;
		} else {
			DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n",
					tun->dev->name, addr[0], addr[1], addr[2],
					addr[3], addr[4], addr[5]);
			kfree_skb(skb);
			continue;
		}
	}

	current->state = TASK_RUNNING;
	remove_wait_queue(&tun->read_wait, &wait);

	return ret;
}
skb_dequeue(src/net/core/skbuff.c)
/**
* skb_dequeue ‐ remove from the head of the queue
* @list: list to dequeue from
*
* Remove the head of the list. The list lock is taken so the function
* may be used safely with other locking list functions. The head item is
* returned or %NULL if the list is empty.
*/
struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
unsigned long flags;
struct sk_buff *result;
spin_lock_irqsave(&list‐>lock, flags);
result = __skb_dequeue(list);
spin_unlock_irqrestore(&list‐>lock, flags);
return result;
}
__skb_dequeue
/**
* __skb_dequeue ‐ remove from the head of the queue
* @list: list to dequeue from
*
* Remove the head of the list. This function does not take any locks
* so must be used with appropriate locks held only. The head item is
* returned or %NULL if the list is empty.
*/
extern struct sk_buff *skb_dequeue(struct sk_buff_head *list);
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
struct sk_buff *next, *prev, *result;
prev = (struct sk_buff *) list;
next = prev‐>next;
result = NULL;
if (next != prev) {
result = next;
next = next‐>next;
list‐>qlen‐‐;
next‐>prev = prev;
prev‐>next = next;
result‐>next = result‐>prev = NULL;
}
return result;
}
tun_put_user
/* Put packet to the user space buffer */
static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
struct sk_buff *skb,
struct iovec *iv, int len)
{
struct tun_pi pi = { 0, skb‐>protocol };
ssize_t total = 0;
if (!(tun‐>flags & TUN_NO_PI)) {
if ((len ‐= sizeof(pi)) < 0)
return ‐EINVAL;
if (len < skb‐>len) {
/* Packet will be striped */
pi.flags |= TUN_PKT_STRIP;
}
if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
return ‐EFAULT;
total += sizeof(pi);
}
len = min_t(int, skb‐>len, len);
skb_copy_datagram_iovec(skb, 0, iv, len);
total += len;
tun‐>stats.tx_packets++;
tun‐>stats.tx_bytes += len;
return total;
}
tun_chr_aio_write(把数据写入到tun 设备中)
/*
 * tun_chr_aio_write - write one packet to the TUN/TAP character device.
 * @iocb:  I/O control block; iocb->ki_filp is the open device file
 * @iv:    source iovec array in user space
 * @count: number of entries in @iv
 * @pos:   file position (unused)
 *
 * Returns -EBADFD if no device is attached to the file, otherwise the result
 * of tun_get_user() called with the total length of @iv.
 */
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
			      unsigned long count, loff_t pos)
{
	struct tun_struct *tun = iocb->ki_filp->private_data;

	if (!tun)
		return -EBADFD;

	/* count is unsigned long, hence %lu. */
	DBG(KERN_INFO "%s: tun_chr_write %lu\n", tun->dev->name, count);

	return tun_get_user(tun, (struct iovec *) iv, iov_total(iv, count));
}
tun_get_user
/*
 * tun_get_user - get a packet from the user space buffer.
 * @tun:   device the packet is injected through
 * @iv:    source iovec array
 * @count: total number of bytes available in @iv
 *
 * Unless TUN_NO_PI is set, a struct tun_pi header is consumed first. The
 * remaining bytes are copied into a freshly allocated sk_buff, which is set
 * up according to the device type (tun or tap) and handed to the network
 * stack with netif_rx_ni(). The rx statistics are updated accordingly.
 *
 * Returns @count on success, -EINVAL if @count is smaller than the header,
 * -EFAULT if copying from user space fails, or -ENOMEM if no sk_buff could
 * be allocated.
 */
static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
{
	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
	struct sk_buff *skb;
	size_t len = count, align = 0;

	if (!(tun->flags & TUN_NO_PI)) {
		/* len is unsigned: a result above count means it wrapped around. */
		if ((len -= sizeof(pi)) > count)
			return -EINVAL;

		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
			return -EFAULT;
	}

	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
		align = NET_IP_ALIGN;

	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
		tun->stats.rx_dropped++;
		return -ENOMEM;
	}

	if (align)
		skb_reserve(skb, align);
	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
		tun->stats.rx_dropped++;
		kfree_skb(skb);
		return -EFAULT;
	}

	switch (tun->flags & TUN_TYPE_MASK) {
	case TUN_TUN_DEV:
		/* tun: protocol comes from the tun_pi header. */
		skb_reset_mac_header(skb);
		skb->protocol = pi.proto;
		skb->dev = tun->dev;
		break;
	case TUN_TAP_DEV:
		/* tap: protocol is determined by eth_type_trans(). */
		skb->protocol = eth_type_trans(skb, tun->dev);
		break;
	}

	if (tun->flags & TUN_NOCHECKSUM)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	netif_rx_ni(skb);
	tun->dev->last_rx = jiffies;

	tun->stats.rx_packets++;
	tun->stats.rx_bytes += len;

	return count;
}