成都的天气好像越来越好了,前几天还穿着穿着外套直打哆嗦,到今天已经“拨开阴云见太阳”,暖洋洋的,心情也暖洋洋的。暖和的正好想睡觉。打个呵欠,把网络设备管理这部份总结下吧。
Linux素以优秀的网络管理能力而著称,linux为何具有这么高的效率?我们从网络设备的管理说起。
Linux为何要对网络设备单独管理呢?这是因为。协议栈很多地方都会涉及到网络设备。小至IP地址的设置。大至IP路由的更新。都离不开高效的网络设备管理。将网络设备单独管理可以提高效率!
每个网络设备,在linux中都会对应一个数据结构,net_device。 就从这个结构说起
Linux 2。6。21中,对net_device定义如下:
struct net_device
{
//设备的名称,例如常见的“eth0”等
char name[IFNAMSIZ];
//共享内存的起始,结束地址
unsigned long mem_end; /* shared mem end */
unsigned long mem_start; /* shared mem start */
//网络设备的I/O基地址
unsigned long base_addr; /* device I/O address */
//被赋予的中断号
unsigned int irq; /* device IRQ number */
//在多端口设备上使用哪一个端口
unsigned char if_port; /* Selectable AUI, TP,..*/
//为设备分配的DMA通道
unsigned char dma; /* DMA channel */
//设备的状态
unsigned long state;
// 下一个net_device
struct net_device *next;
//初始化函数。
int (*init)(struct net_device *dev);
struct net_device *next_sched;
/* Interface index. Unique device identifier */
//设备在内核中对应的序号
int ifindex;
int iflink;
//获得接口状态的函数指针
struct net_device_stats* (*get_stats)(struct net_device *dev);
struct iw_statistics* (*get_wireless_stats)(struct net_device *dev);
struct iw_handler_def * wireless_handlers;
struct ethtool_ops *ethtool_ops;
//传输状态。检查传输是否被锁住
unsigned long trans_start; /* Time (in jiffies) of last Tx */
//最使使用的时间
unsigned long last_rx; /* Time of last Rx */
//接口标志
unsigned short flags; /* interface flags (a la BSD) */
unsigned short gflags;
unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
unsigned short unused_alignment_fixer; /* Because we need priv_flags,
* and we want to be 32-bit aligned.
*/
unsigned mtu; /* interface MTU value */
unsigned short type; /* interface hardware type */
unsigned short hard_header_len; /* hardware hdr length */
void *priv; /* pointer to private data */
struct net_device *master; /* Pointer to master device of a group,
* which this device is member of.
*/
/* Interface address info. */
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */
unsigned char addr_len; /* hardware address length */
struct dev_mc_list *mc_list; /* Multicast mac addresses */
int mc_count; /* Number of installed mcasts */
int promiscuity;
int allmulti;
int watchdog_timeo;
struct timer_list watchdog_timer;
/* Protocol specific pointers */
void *atalk_ptr; /* AppleTalk link */
void *ip_ptr; /* IPv4 specific data */
void *dn_ptr; /* DECnet specific data */
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
struct list_head poll_list; /* Link to poll list */
int quota;
int weight;
struct Qdisc *qdisc;
struct Qdisc *qdisc_sleeping;
struct Qdisc *qdisc_ingress;
struct list_head qdisc_list;
unsigned long tx_queue_len; /* Max frames per queue allowed */
/* ingress path synchronizer */
spinlock_t ingress_lock;
/* hard_start_xmit synchronizer */
spinlock_t xmit_lock;
/* cpu id of processor entered to hard_start_xmit or -1,
if nobody entered there.
*/
int xmit_lock_owner;
/* device queue lock */
spinlock_t queue_lock;
/* Number of references to this device */
atomic_t refcnt;
/* delayed register/unregister */
struct list_head todo_list;
/* device name hash chain */
struct hlist_node name_hlist;
/* device index hash chain */
struct hlist_node index_hlist;
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERING, /* called register_netdevice */
NETREG_REGISTERED, /* completed register todo */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
} reg_state;
/* Net device features */
int features;
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum only TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
#define NETIF_F_LLTX 4096 /* LockLess TX */
/* Called after device is detached from network. */
void (*uninit)(struct net_device *dev);
/* Called after last user reference disappears. */
void (*destructor)(struct net_device *dev);
/* Pointers to interface service routines. */
//打开函数指针
int (*open)(struct net_device *dev);
//设备停用时调用此函数
int (*stop)(struct net_device *dev);
//初始化数据包的传输
int (*hard_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
#define HAVE_NETDEV_POLL
//轮询函数
int (*poll) (struct net_device *dev, int *quota);
//建立硬件头信息
int (*hard_header) (struct sk_buff *skb,
struct net_device *dev,
unsigned short type,
void *daddr,
void *saddr,
unsigned len);
//ARP解析之后,重构头部
int (*rebuild_header)(struct sk_buff *skb);
#define HAVE_MULTICAST
//多播支持函数
void (*set_multicast_list)(struct net_device *dev);
#define HAVE_SET_MAC_ADDR
int (*set_mac_address)(struct net_device *dev,
void *addr);
#define HAVE_PRIVATE_IOCTL
int (*do_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd);
#define HAVE_SET_CONFIG
int (*set_config)(struct net_device *dev,
struct ifmap *map);
#define HAVE_HEADER_CACHE
int (*hard_header_cache)(struct neighbour *neigh,
struct hh_cache *hh);
void (*header_cache_update)(struct hh_cache *hh,
struct net_device *dev,
unsigned char * haddr);
#define HAVE_CHANGE_MTU
int (*change_mtu)(struct net_device *dev, int new_mtu);
#define HAVE_TX_TIMEOUT
void (*tx_timeout) (struct net_device *dev);
void (*vlan_rx_register)(struct net_device *dev,
struct vlan_group *grp);
void (*vlan_rx_add_vid)(struct net_device *dev,
unsigned short vid);
void (*vlan_rx_kill_vid)(struct net_device *dev,
unsigned short vid);
int (*hard_header_parse)(struct sk_buff *skb,
unsigned char *haddr);
int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
int (*accept_fastpath)(struct net_device *, struct dst_entry*);
#ifdef CONFIG_NETPOLL
int netpoll_rx;
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*poll_controller)(struct net_device *dev);
#endif
/* bridge stuff */
//对应的网桥端口(以后分析)
struct net_bridge_port *br_port;
#ifdef CONFIG_NET_DIVERT
/* this will get initialized at each interface type init routine */
struct divert_blk *divert;
#endif /* CONFIG_NET_DIVERT */
/* class/net/name entry */
struct class_device class_dev;
/* how much padding had been added by alloc_netdev() */
int padded;
}
晕,太多的成员。太庞大了。不要紧,等到要使用到相应成员的时候再来解释好了。
注意到这么庞大的结构中,有个成员叫: struct net_device *next,呵呵,很熟悉吧,就是用它来建立网络设备的链表。
每一个网络设备启动的时候,都会调用register_netdev() (drivers/net/net_init.c)
跟踪这个函数:
int register_netdev(struct net_device *dev)
{
int err;
rtnl_lock();
/*
* If the name is a format string the caller wants us to
* do a name allocation
*/
if (strchr(dev->name, '%'))
{
err = dev_alloc_name(dev, dev->name);
if (err < 0)
goto out;
}
/*
* Back compatibility hook. Kill this one in 2.5
*/
if (dev->name[0]==0 || dev->name[0]==' ')
{
err = dev_alloc_name(dev, "eth%d");
if (err < 0)
goto out;
}
err = register_netdevice(dev);
out:
rtnl_unlock();
return err;
}
跟踪至: register_netdevice(struct net_device *dev) (net/core/dev.c)
int register_netdevice(struct net_device *dev)
{
struct hlist_head *head;
struct hlist_node *p;
int ret;
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
spin_lock_init(&dev->queue_lock);
spin_lock_init(&dev->xmit_lock);
dev->xmit_lock_owner = -1;
#ifdef CONFIG_NET_CLS_ACT
spin_lock_init(&dev->ingress_lock);
#endif
ret = alloc_divert_blk(dev);
if (ret)
goto out;
dev->iflink = -1;
/* Init, if this function is available */
//如果dev -> init 被赋值,那么调用此函数
if (dev->init) {
ret = dev->init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
goto out_err;
}
}
//判断name 是否合法
if (!dev_valid_name(dev->name)) {
ret = -EINVAL;
goto out_err;
}
//为此设备分配一个index
dev->ifindex = dev_new_index();
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
/* Check for existence of name */
//所有网络设备,以名字作为哈希主键存在dev_name_head中,该变量是一个哈希数组
//找到该名字对应的链表
//如果内核中已经含有此名字的网络设备,出错退出
head = dev_name_hash(dev->name);
hlist_for_each(p, head) {
struct net_device *d
= hlist_entry(p, struct net_device, name_hlist);
if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
ret = -EEXIST;
goto out_err;
}
}
/* Fix illegal SG+CSUM combinations. */
if ((dev->features & NETIF_F_SG) &&
!(dev->features & (NETIF_F_IP_CSUM |
NETIF_F_NO_CSUM |
NETIF_F_HW_CSUM))) {
printk("%s: Dropping NETIF_F_SG since no checksum feature./n",
dev->name);
dev->features &= ~NETIF_F_SG;
}
/*
* nil rebuild_header routine,
* that should be never called and used as just bug trap.
*/
//为rebuild_header赋默认值
if (!dev->rebuild_header)
dev->rebuild_header = default_rebuild_header;
/*
* Default initial state at registry is that the
* device is present.
*/
set_bit(__LINK_STATE_PRESENT, &dev->state);
dev->next = NULL;
dev_init_scheduler(dev);
write_lock_bh(&dev_base_lock);
//初始化的时候,有struct net_device **dev_tail = &dev_base;
//这段代码的意思实际就是:把dev加入dev_base为首结点队链表的尾部
*dev_tail = dev;
dev_tail = &dev->next;
//把此结点加入到以名字为哈希主键的链表数组dev_name_head中
hlist_add_head(&dev->name_hlist, head);
//把此结点加到以序号为主键的链表数组dev_index_head中
hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
dev_hold(dev);
dev->reg_state = NETREG_REGISTERING;
write_unlock_bh(&dev_base_lock);
/* Notify protocols, that a new device appeared. */
//在通知链表上发送事件
notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
/* Finish registration after unlock */
net_set_todo(dev);
ret = 0;
out:
return ret;
out_err:
free_divert_blk(dev);
goto out;
}
从此可以看出。新加入一个设备时,会插入三个位置:以名字为哈希值组织的dev_name_head ,以序号为主链的哈希数组dev_index_head.还有dev_base.它为快速查找网络设备提供了基础。事实上。在内核中,经常要根据index找到dev. 或者根据name找到dev.我们遇到的时候再分析
到现在,我们可以在内核中顺藤摸瓜的找到每一个网络设备了。
还有很重要的。设备更改了配置,要怎么通知跟他相关的子系统呢?例如,网卡更新了IP,如何使路由得到更新?
接着往下看:
注意到上面注册代码中所调用的一个函数notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev).
该函数的作用是,在通知链表上netdev_chain上发送NETDEV_REGISTER消息,所有在与该通知链表关联的子系统都可以收到此消息。以此,可以快速的更新整个系统的配置消息。
以路由子系统为例,来讲述该过程:
在IPV4子系统加载的时候,加调用ip_init(),接着调用fib_init(),然后再调用ip_fib_init()
跟踪一下此函数:
void __init ip_fib_init(void)
{
#ifndef CONFIG_IP_MULTIPLE_TABLES
ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
#else
fib_rules_init();
#endif
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
}
register_netdevice_notifier是做什么的呢?往下跟踪:
int register_netdevice_notifier(struct notifier_block *nb)
{
struct net_device *dev;
int err;
rtnl_lock();
//注册通知链
err = notifier_chain_register(&netdev_chain, nb);
if (!err) {
for (dev = dev_base; dev; dev = dev->next) {
nb->notifier_call(nb, NETDEV_REGISTER, dev);
if (dev->flags & IFF_UP)
nb->notifier_call(nb, NETDEV_UP, dev);
}
}
rtnl_unlock();
return err;
}
呵呵,它在netdev_chain上注册了通知链,当此链上有事件发生时,会调用fib_netdev_notifiers中的相关信息处理,看一下fib_netdev_notifier的信息:
struct notifier_block fib_netdev_notifier = {
.notifier_call =fib_netdev_event,
};
OK,现在越来越具体了,如果netdev_chain有事件,会调用fib_netdev_event处理。继续跟踪:
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = ptr;
struct in_device *in_dev = __in_dev_get(dev);
//设备注销
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
return NOTIFY_DONE;
}
if (!in_dev)
return NOTIFY_DONE;
switch (event) {
//设备UP
case NETDEV_UP:
for_ifa(in_dev) {
fib_add_ifaddr(ifa);
} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
rt_cache_flush(-1);
break;
//设备DOWN
case NETDEV_DOWN:
fib_disable_ip(dev, 0);
break;
//设备参数改变
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
rt_cache_flush(0);
break;
}
return NOTIFY_DONE;
}
路由部份的代码将在后续的笔记中给出。至此,整个网络设备的架构非常的清晰了!