linux内核网络协议栈学习笔记(2)

时间:2021-10-09 11:07:53

这篇主要介绍net_device设备的注册,初始化等流程

如今的网卡无论是千兆卡还是万兆卡,基本都是pci设备(万兆卡基本上都是pcie设备),我们首先来看下pci设备相关的操作


/*
 * One entry of a PCI driver's device match table.  The ID fields hold
 * either a concrete value or PCI_ANY_ID.
 */
struct pci_device_id {
    __u32 vendor;               /* Vendor ID, or PCI_ANY_ID */
    __u32 device;               /* Device ID, or PCI_ANY_ID */
    __u32 subvendor;            /* Subsystem vendor ID, or PCI_ANY_ID */
    __u32 subdevice;            /* Subsystem device ID, or PCI_ANY_ID */
    __u32 class;                /* (class, subclass, prog-if) triplet */
    __u32 class_mask;           /* Mask that goes with the class triplet */
    kernel_ulong_t driver_data; /* Data private to the driver */
};

这个是pci设备的配置空间里的一些项组成的结构


/*
 * Descriptor of a PCI driver: its name, the table of device IDs it
 * handles, and the callbacks invoked for probe/remove, power management,
 * shutdown and error handling.
 */
struct pci_driver {
    struct list_head node;
    char *name;
    const struct pci_device_id *id_table;   /* must be non-NULL for probe to be called */
    int  (*probe)  (struct pci_dev *dev, const struct pci_device_id *id);   /* New device inserted */
    void (*remove) (struct pci_dev *dev);   /* Device removed (NULL if not a hot-plug capable driver) */
    int  (*suspend) (struct pci_dev *dev, pm_message_t state);  /* Device suspended */
    int  (*suspend_late) (struct pci_dev *dev, pm_message_t state);
    int  (*resume_early) (struct pci_dev *dev);
    int  (*resume) (struct pci_dev *dev);                   /* Device woken up */
    void (*shutdown) (struct pci_dev *dev);
    struct pci_error_handlers *err_handler;
    struct device_driver    driver;
    struct pci_dynids dynids;
    /* RHEL6: padding to add future features to the pci_driver struct */
    void *rh_reserved; 
};            

这个结构是pci_register_driver/pci_unregister_driver要用到的结构,也是pci设备的核心结构


拿intel 万兆网卡的驱动为例:

/*
 * PCI device IDs handled by the ixgbe driver.  Every entry pairs an Intel
 * device ID with the board type (board_82598, board_82599 or board_X540)
 * stored alongside it; the list is closed by an all-zero entry.
 */
static DEFINE_PCI_DEVICE_TABLE(ixgbe_pci_tbl) = {
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AF_DUAL_PORT), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AF_SINGLE_PORT), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AT), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598AT2), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598EB_CX4), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_CX4_DUAL_PORT), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_DA_DUAL_PORT), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598EB_XF_LR), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598EB_SFP_LOM), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82598_BX), board_82598 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_KX4), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_XAUI_LOM), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_KR), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_EM), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_KX4_MEZZ), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_CX4), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_BACKPLANE_FCOE), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_FCOE), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_T3_LOM), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_COMBO_BACKPLANE), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X540T), board_X540 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_SFP_SF2), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_LS), board_82599 },
    {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599EN_SFP), board_82599 },
    /* required last entry */
    {0, }
};
/* Publish the table as this module's "pci" device table. */
MODULE_DEVICE_TABLE(pci, ixgbe_pci_tbl);


/*
 * PCI driver descriptor for ixgbe.  It ties the ID table ixgbe_pci_tbl to
 * the driver's probe/remove/shutdown callbacks and its error handlers.
 */
static struct pci_driver ixgbe_driver = {
    .name     = ixgbe_driver_name,
    .id_table = ixgbe_pci_tbl,
    .probe    = ixgbe_probe,
    .remove   = __devexit_p(ixgbe_remove),
    /* Suspend/resume callbacks are only set when CONFIG_PM is defined. */
#ifdef CONFIG_PM
    .suspend  = ixgbe_suspend,
    .resume   = ixgbe_resume,
#endif 
    .shutdown = ixgbe_shutdown,
    .err_handler = &ixgbe_err_handler
};


ixgbe_probe就是设备被pci总线探测到之后调用的函数(即pci_driver的probe回调),反之如果要卸载设备则调用ixgbe_remove(即remove回调)

还是拿intel ixgbe网卡为例,ixgbe_probe会调用alloc_etherdev_mq创建一个net_device结构,而alloc_etherdev_mq实际调用了alloc_etherdev_mqs(现在的驱动已经不调alloc_netdev这种方法了)


alloc_etherdev_mqs首先判断txqs(发送队列个数),rxqs(接收队列个数),如果小于1报错返回

下面开始分配一段连续空间,大小为net_device结构大小加上驱动私有区域空间大小,对intel ixgbe驱动而言,是一个ixgbe_adapter的结构体,一般都是net_device的连续空间在前,ixgbe_adapter在后

netdev->netdev_ops = &ixgbe_netdev_ops,把驱动的底层方法注册到net_device结构中


alloc_etherdev_mqs会调用ether_setup函数

/*
 * Fill in the Ethernet defaults of a net_device: hardware type, header
 * operations, header and address lengths, broadcast address, MTU,
 * transmit queue length and interface flags.
 */
void ether_setup(struct net_device *dev)
{
    /* Link-layer identity and framing */
    dev->type            = ARPHRD_ETHER;
    dev->header_ops      = &eth_header_ops;
    dev->hard_header_len = ETH_HLEN;
    dev->addr_len        = ETH_ALEN;
    memset(dev->broadcast, 0xFF, ETH_ALEN);    /* all-ones broadcast address */

    /* Payload size and transmit queue depth */
    dev->mtu          = ETH_DATA_LEN;
    dev->tx_queue_len = 1000;    /* Ethernet wants good queues */

    /* Interface flags */
    dev->flags = IFF_BROADCAST | IFF_MULTICAST;
    netdev_extended(dev)->ext_priv_flags = IFF_TX_SKB_SHARING;
}

最终通过register_netdev/unregister_netdev来注册,注销网络设备


register_netdev如下:

/*
 * Register a network device while holding the rtnl lock.
 *
 * If dev->name contains a '%' it is treated as a format string and
 * dev_alloc_name() is asked to pick the actual name first.  Returns the
 * negative result of dev_alloc_name() if that fails, otherwise whatever
 * register_netdevice() returns.
 */
int register_netdev(struct net_device *dev)
{
    int rc = 0;

    rtnl_lock();

    /* A format-string name means the caller wants a name allocated. */
    if (strchr(dev->name, '%') != NULL)
        rc = dev_alloc_name(dev, dev->name);

    /* Only go on to registration when name allocation did not fail. */
    if (rc >= 0)
        rc = register_netdevice(dev);

    rtnl_unlock();
    return rc;
}


内核把netdevice结构通过list_head, hlist_node串联起来,其中

struct list_head dev_list,是链表连接的所有net_device;struct hlist_node name_hlist,是基于名字哈希的,struct hlist_node index_hlist,是基于设备index哈希的


下面分析register_netdevice函数:

    /*
     * No RX queue count has been recorded for this device: set it to one
     * and allocate the queue.  A non-zero result from
     * netif_alloc_rx_queues() jumps to the out label (outside this excerpt).
     */
    if (!netdev_extended(dev)->rps_data.num_rx_queues) {
        /*
         * Allocate a single RX queue if driver never called
         * alloc_netdev_mq
         */
        netdev_extended(dev)->rps_data.num_rx_queues = 1;
        ret = netif_alloc_rx_queues(dev);
        if (ret)    
            goto out;   
    }

如果num_rx_queues为0,即没有接收队列,于是调用netif_alloc_rx_queues创建一个接收队列

    /* Run the driver's ndo_init hook, if the driver provides one. */
    if (dev->netdev_ops->ndo_init) {
        ret = dev->netdev_ops->ndo_init(dev);
        if (ret) {
            /* A positive return value is reported as -EIO instead. */
            if (ret > 0)
                ret = -EIO;
            goto out;
        }
    }

调用驱动提供的 init 函数初始化设备

/* Give the device the interface index returned by dev_new_index(). */
dev->ifindex = dev_new_index(net);
    /* An iflink of -1 is replaced by the device's own ifindex. */
    if (dev->iflink == -1)
        dev->iflink = dev->ifindex;

设置dev->ifindex, dev->iflink

检查NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, NETIF_F_IPV6_CSUM, NETIF_F_SG, NETIF_F_GSO, NETIF_F_GRO等特性标志(feature flags)


在register_netdevice / unregister_netdevice 之后,会调用rtnl_unlock函数,rtnl_unlock内部又会调用 netdev_run_todo 来处理挂在全局链表 net_todo_list 上的待处理设备,分析 netdev_run_todo 如下:

/*
 * Finish the unregistration of every device queued on net_todo_list.
 *
 * The global list is first moved onto a local list and the rtnl lock is
 * released with __rtnl_unlock().  Each queued device is then marked
 * NETREG_UNREGISTERED, waited on until its reference count is zero, passed
 * to its destructor (if one is set) and finally has its kobject reference
 * dropped.
 */
void netdev_run_todo(void)
{           
    struct list_head list;
            
    /* Snapshot list, allow later requests */
    list_replace_init(&net_todo_list, &list);

    __rtnl_unlock();

    while (!list_empty(&list)) {
        /* Take the first entry off the snapshot list. */
        struct net_device *dev
            = list_entry(list.next, struct net_device, todo_list);
        list_del(&dev->todo_list);

        /*
         * Only devices in NETREG_UNREGISTERING state are processed; any
         * other state is logged with a stack dump and the entry skipped.
         */
        if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
            printk(KERN_ERR "network todo '%s' but state %d\n",
                   dev->name, dev->reg_state);
            dump_stack();
            continue;
        }

        dev->reg_state = NETREG_UNREGISTERED;
        /* Run flush_backlog for this device on every CPU. */
        on_each_cpu(flush_backlog, dev, 1);
        /* Returns only once dev->refcnt has been observed as zero. */
        netdev_wait_allrefs(dev);
        /* paranoia */
        BUG_ON(atomic_read(&dev->refcnt));
        WARN_ON(dev->ip_ptr);
        WARN_ON(dev->ip6_ptr);
        WARN_ON(dev->dn_ptr);

        /* The destructor is optional. */
        if (dev->destructor)
            dev->destructor(dev);

        /* Free network device */
        kobject_put(&dev->dev.kobj);
    }
}

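首先list_replace_init 把全局 net_todo_list 上的所有节点整体转移到局部变量 list 上,并把 net_todo_list 重新初始化为空链表(相当于做了一次快照),这样在 __rtnl_unlock 释放锁之后,新的请求仍然可以继续挂到 net_todo_list 上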

接下来进入一个循环,遍历局部链表 list 中的每个节点,用 list_entry(即 container_of)宏得到 net_device;如果设备状态不是 NETREG_UNREGISTERING,则打印错误信息并跳过该设备;否则把设备设置为 NETREG_UNREGISTERED 状态,接着调用 netdev_wait_allrefs 等待设备的引用计数清0,一旦为0,如果设置了 dev->destructor 就调用它释放资源(一般dev->destructor 会调用 free_netdev),最后通过 kobject_put 释放设备对象的引用


netdev_wait_allrefs 分析如下:

/*
 * Block until dev->refcnt drops to zero.
 *
 * Each loop pass sleeps for 250 ms.  Once more than 1 * HZ jiffies have
 * elapsed since the last rebroadcast, the NETDEV_UNREGISTER notification
 * is sent again under the rtnl lock.  Once more than 10 * HZ jiffies have
 * elapsed since the last warning, a KERN_EMERG message with the device
 * name and current usage count is printed.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{           
    unsigned long rebroadcast_time, warning_time;
            
    /* Both timers start counting from now. */
    rebroadcast_time = warning_time = jiffies;
    while (atomic_read(&dev->refcnt) != 0) {
        if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
            rtnl_lock();
    
            /* Rebroadcast unregister notification */
            call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
   
            if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
                     &dev->state)) {
                /* We must not have linkwatch events
                 * pending on unregister. If this
                 * happens, we simply run the queue
                 * unscheduled, resulting in a noop
                 * for this device.
                 */
                linkwatch_run_queue();
            }
    
            __rtnl_unlock();

            /* Restart the rebroadcast interval. */
            rebroadcast_time = jiffies;
        }
    
        msleep(250);

        if (time_after(jiffies, warning_time + 10 * HZ)) {
            printk(KERN_EMERG "unregister_netdevice: "
                   "waiting for %s to become free. Usage "
                   "count = %d\n",
                   dev->name, atomic_read(&dev->refcnt));
            /* Restart the warning interval. */
            warning_time = jiffies;
        }
    }   
}       

只要设备的 dev->refcnt 不为0,就每隔约1秒(超过 rebroadcast_time + 1 * HZ)调用一次 call_netdevice_notifiers(NETDEV_UNREGISTER, dev) 给 notification chain 上的所有子系统重新发送通知,