这篇主要介绍net_device设备的注册,初始化等流程
如今的网卡无论是千兆卡还是万兆卡,基本都是pci设备(万兆卡基本上都是pcie设备),我们首先来看下pci设备相关的操作
/*
 * One match entry of a PCI driver's id_table.
 */
struct pci_device_id {
	__u32 vendor;			/* Vendor ID, or PCI_ANY_ID */
	__u32 device;			/* Device ID, or PCI_ANY_ID */
	__u32 subvendor;		/* Subsystem vendor ID, or PCI_ANY_ID */
	__u32 subdevice;		/* Subsystem device ID, or PCI_ANY_ID */
	__u32 class;			/* (class, subclass, prog-if) triplet */
	__u32 class_mask;		/* NOTE(review): presumably the mask applied to class when matching - confirm */
	kernel_ulong_t driver_data;	/* Data private to the driver */
};
这个是pci设备的配置空间里的一些项组成的结构
/*
 * Descriptor a PCI driver fills in: the table of devices it matches plus
 * the callbacks invoked on those devices.
 */
struct pci_driver {
	struct list_head node;	/* list linkage; the owning list is not visible here */
	char *name;		/* driver name */

	/* Match table; must be non-NULL for probe to be called. */
	const struct pci_device_id *id_table;

	/* New device inserted. */
	int (*probe)(struct pci_dev *dev, const struct pci_device_id *id);
	/* Device removed (NULL if not a hot-plug capable driver). */
	void (*remove)(struct pci_dev *dev);

	/* Device suspended. */
	int (*suspend)(struct pci_dev *dev, pm_message_t state);
	/* NOTE(review): presumably a later stage of suspend - confirm. */
	int (*suspend_late)(struct pci_dev *dev, pm_message_t state);
	/* NOTE(review): presumably an earlier stage of resume - confirm. */
	int (*resume_early)(struct pci_dev *dev);
	/* Device woken up. */
	int (*resume)(struct pci_dev *dev);

	void (*shutdown)(struct pci_dev *dev);

	struct pci_error_handlers *err_handler;	/* error handler callbacks */
	struct device_driver driver;		/* embedded generic driver object */
	struct pci_dynids dynids;

	/* RHEL6: padding to add future features to the pci_driver struct */
	void *rh_reserved;
};
这个结构是pci_register_driver/pci_unregister_driver要用到的结构,也是pci驱动的核心结构
拿intel 万兆网卡的驱动为例:
/*
 * PCI device IDs handled by the ixgbe driver.
 *
 * Each entry pairs an Intel device ID with a board_* value (presumably
 * stored in pci_device_id.driver_data - confirm against PCI_VDEVICE).
 * The table ends with an all-zero entry.
 */
static DEFINE_PCI_DEVICE_TABLE(ixgbe_pci_tbl) = {
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AF_DUAL_PORT), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AF_SINGLE_PORT), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AT), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AT2), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598EB_CX4), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_CX4_DUAL_PORT), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_DA_DUAL_PORT), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598EB_XF_LR), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598EB_SFP_LOM), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_BX), board_82598 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_KX4), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_XAUI_LOM), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_KR), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_EM), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_KX4_MEZZ), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_CX4), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_BACKPLANE_FCOE), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_FCOE), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_T3_LOM), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_COMBO_BACKPLANE), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X540T), board_X540 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_SF2), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_LS), board_82599 },
	{PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599EN_SFP), board_82599 },

	/* required last entry */
	{0, }
};

MODULE_DEVICE_TABLE(pci, ixgbe_pci_tbl);
/*
 * pci_driver instance for ixgbe: binds the ID table above to the driver's
 * probe/remove/shutdown callbacks and its error handlers. The suspend and
 * resume callbacks are only wired up when CONFIG_PM is enabled.
 */
static struct pci_driver ixgbe_driver = {
	.name		= ixgbe_driver_name,
	.id_table	= ixgbe_pci_tbl,
	.probe		= ixgbe_probe,
	.remove		= __devexit_p(ixgbe_remove),
#ifdef CONFIG_PM
	.suspend	= ixgbe_suspend,
	.resume		= ixgbe_resume,
#endif
	.shutdown	= ixgbe_shutdown,
	.err_handler	= &ixgbe_err_handler,
};
ixgbe_probe就是设备被pci总线探测到之后调用的函数(即.probe回调),反之设备卸载时调用ixgbe_remove(即.remove回调)
还是拿intel ixgbe网卡为例,ixgbe_probe会调用alloc_etherdev_mq创建一个net_device结构,而alloc_etherdev_mq实际调用了alloc_etherdev_mqs(现在的驱动已经不调alloc_netdev这种方法了)
alloc_etherdev_mqs首先判断txqs(发送队列个数),rxqs(接收队列个数),如果小于1报错返回
下面开始分配一段连续空间,大小为net_device结构大小加上驱动私有区域空间大小,对intel ixgbe驱动而言,是一个ixgbe_adapter的结构体,一般都是net_device的连续空间在前,ixgbe_adapter在后
ixgbe_probe中通过netdev->netdev_ops = &ixgbe_netdev_ops,把驱动的底层方法注册到net_device结构中
alloc_etherdev_mqs会调用ether_setup函数
void ether_setup(struct net_device *dev)
{
dev->header_ops = ð_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
dev->tx_queue_len = 1000; /* Ethernet wants good queues */
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
netdev_extended(dev)->ext_priv_flags = IFF_TX_SKB_SHARING;
memset(dev->broadcast, 0xFF, ETH_ALEN);
}
最终通过register_netdev/unregister_netdev来注册,注销网络设备
register_netdev如下:
/*
 * register_netdev - register a network device while holding the RTNL lock
 * @dev: device to register
 *
 * Takes the RTNL lock, resolves a format-string name (one containing '%')
 * through dev_alloc_name(), then calls register_netdevice(). The lock is
 * released on every path.
 *
 * Returns the negative result of dev_alloc_name() if name allocation
 * fails, otherwise whatever register_netdevice() returns.
 */
int register_netdev(struct net_device *dev)
{
	int err = 0;

	rtnl_lock();

	/*
	 * If the name is a format string the caller wants us to do a
	 * name allocation.
	 */
	if (strchr(dev->name, '%'))
		err = dev_alloc_name(dev, dev->name);

	/* Only go on to registration when name allocation did not fail. */
	if (err >= 0)
		err = register_netdevice(dev);

	rtnl_unlock();
	return err;
}
内核把netdevice结构通过list_head, hlist_node串联起来,其中
struct list_head dev_list,是链表连接的所有net_device;struct hlist_node name_hlist,是基于名字哈希的,struct hlist_node index_hlist,是基于设备index哈希的
下面分析register_netdevice函数:
/*
 * Excerpt from register_netdevice(): when no RX queue count has been
 * recorded for the device, record a count of one and allocate the RX
 * queues; a non-zero result from netif_alloc_rx_queues() jumps to the
 * out label (outside this excerpt).
 */
if (!netdev_extended(dev)->rps_data.num_rx_queues) {
/*
* Allocate a single RX queue if driver never called
* alloc_netdev_mq
*/
netdev_extended(dev)->rps_data.num_rx_queues = 1;
ret = netif_alloc_rx_queues(dev);
if (ret)
goto out;
}
如果num_rx_queues为0,即没有接收队列,于是调用netif_alloc_rx_queues创建一个接收队列
/*
 * Init, if this function is available.
 * Any non-zero result from ndo_init aborts via the out label (outside
 * this excerpt); a positive result is first converted to -EIO.
 */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
goto out;
}
}
调用驱动提供的 init 函数初始化设备
/*
 * Assign the interface index returned by dev_new_index(net); an iflink
 * that is still -1 defaults to the device's own ifindex.
 */
dev->ifindex = dev_new_index(net);
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
设置dev->ifindex, dev->iflink
检查NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, NETIF_F_IPV6_CSUM,NETIF_F_SG,NETIF_F_GSO,NETIF_F_GRO等标签
在register_netdevice / unregister_netdevice 之后,会调用rtnl_unlock函数,rtnl_unlock里面又会调用 netdev_run_todo 来处理 net_todo_list 上挂着的设备,分析 netdev_run_todo 如下:
void netdev_run_todo(void)
{
struct list_head list;
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
__rtnl_unlock();
while (!list_empty(&list)) {
struct net_device *dev
= list_entry(list.next, struct net_device, todo_list);
list_del(&dev->todo_list);
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
printk(KERN_ERR "network todo '%s' but state %d\n",
dev->name, dev->reg_state);
dump_stack();
continue;
}
dev->reg_state = NETREG_UNREGISTERED;
on_each_cpu(flush_backlog, dev, 1);
netdev_wait_allrefs(dev);
/* paranoia */
BUG_ON(atomic_read(&dev->refcnt));
WARN_ON(dev->ip_ptr);
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
if (dev->destructor)
dev->destructor(dev);
/* Free network device */
kobject_put(&dev->dev.kobj);
}
}
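首先list_replace_init 把全局 net_todo_list 上的所有节点转移到局部变量 list 上,并把 net_todo_list 重新初始化为空链表,这样之后新的请求仍然可以挂到 net_todo_list 上;然后调用 __rtnl_unlock 释放 rtnl 锁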
接下来进入一个循环,对于局部 list 里的每个节点,用 list_entry 宏(即 container_of)得到 net_device;如果设备状态不是 NETREG_UNREGISTERING 则打印错误并跳过,否则把设备设置为 NETREG_UNREGISTERED 状态,接着调用 netdev_wait_allrefs 等待设备的引用计数清0,一旦为0,如果设置了 dev->destructor 就调用它释放资源(一般dev->destructor 会调用 free_netdev),最后通过 kobject_put 释放设备对象
netdev_wait_allrefs 分析如下:
/*
 * netdev_wait_allrefs - block until a device's reference count reaches zero
 * @dev: device being unregistered
 *
 * Polls dev->refcnt, sleeping 250 ms per iteration. Whenever more than
 * one second has passed since the last rebroadcast, the NETDEV_UNREGISTER
 * notification is sent again under the RTNL lock. Whenever more than ten
 * seconds have passed since the last warning, the device name and current
 * usage count are logged at KERN_EMERG.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long last_rebroadcast = jiffies;
	unsigned long last_warning = last_rebroadcast;

	while (atomic_read(&dev->refcnt) != 0) {
		if (time_after(jiffies, last_rebroadcast + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			last_rebroadcast = jiffies;
		}

		msleep(250);

		if (time_after(jiffies, last_warning + 10 * HZ)) {
			printk(KERN_EMERG "unregister_netdevice: "
			       "waiting for %s to become free. Usage "
			       "count = %d\n",
			       dev->name, atomic_read(&dev->refcnt));
			last_warning = jiffies;
		}
	}
}
只要设备的 dev->refcnt 不为0,就调用call_netdevice_notifiers(NETDEV_UNREGISTER, dev) 给 notification chain 上的所有子系统发送通知,