linux内核网络协议栈学习笔记(6)

时间:2021-11-06 11:04:05


本篇讨论IP包的收发(暂不包括路由)

先来看inet_init,

首先是调用proto_register,注册了tcp_prot, udp_prot, raw_prot,其中proto_register前半部分是初始化各种slab_cache,后半部分把这些struct proto结构链到proto_list里

其次调用sock_register,内核有一个全局的net_proto_family结构的net_families数组,inet_init调用sock_register就是把inet_family_ops加到net_families[PF_INET]中,inet_family_ops结构如下

/*
 * Address-family descriptor for PF_INET. Per the notes above, inet_init
 * passes this to sock_register, which stores it in the global net_families
 * table.
 */
static struct net_proto_family inet_family_ops = {
    .family = PF_INET,        /* family this descriptor is registered under */
    .create = inet_create,    /* creation callback for this family */
    .owner  = THIS_MODULE,
};

接着调用inet_add_protocol,去填充inet_protos数组,inet_protos是一个全局的指针数组,其定义如下:

/*
 * Global table of pointers to per-protocol descriptors (struct net_protocol),
 * filled in through inet_add_protocol. MAX_INET_PROTOS bounds the number of
 * slots.
 */
const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;

可以看出数组最大长度MAX_INET_PROTOS为256,in.h里对所有的协议做了定义

/* Standard well-defined IP protocols.  */
/*
 * Every enumerator except the last carries an explicit value, so the list
 * does not have to be in ascending order (IPPROTO_IPV6 = 41 appears after
 * IPPROTO_GRE = 47). IPPROTO_MAX has no initializer and therefore evaluates
 * to IPPROTO_RAW + 1, i.e. 256.
 */
enum {
  IPPROTO_IP = 0,       /* Dummy protocol for TCP       */
  IPPROTO_ICMP = 1,     /* Internet Control Message Protocol    */
  IPPROTO_IGMP = 2,     /* Internet Group Management Protocol   */
  IPPROTO_IPIP = 4,     /* IPIP tunnels (older KA9Q tunnels use 94) */
  IPPROTO_TCP = 6,      /* Transmission Control Protocol    */
  IPPROTO_EGP = 8,      /* Exterior Gateway Protocol        */
  IPPROTO_PUP = 12,     /* PUP protocol             */
  IPPROTO_UDP = 17,     /* User Datagram Protocol       */
  IPPROTO_IDP = 22,     /* XNS IDP protocol         */
  IPPROTO_DCCP = 33,        /* Datagram Congestion Control Protocol */
  IPPROTO_RSVP = 46,        /* RSVP protocol            */
  IPPROTO_GRE = 47,     /* Cisco GRE tunnels (rfc 1701,1702)    */
  IPPROTO_IPV6   = 41,      /* IPv6-in-IPv4 tunnelling      */
  IPPROTO_ESP = 50,            /* Encapsulation Security Payload protocol */
  IPPROTO_AH = 51,             /* Authentication Header protocol       */
  IPPROTO_BEETPH = 94,         /* IP option pseudo header for BEET */
  IPPROTO_PIM    = 103,     /* Protocol Independent Multicast   */
  IPPROTO_COMP   = 108,                /* Compression Header protocol */
  IPPROTO_SCTP   = 132,     /* Stream Control Transport Protocol    */
  IPPROTO_UDPLITE = 136,    /* UDP-Lite (RFC 3828)          */
  IPPROTO_RAW    = 255,     /* Raw IP packets           */
  IPPROTO_MAX               /* one past the largest protocol number */
};

inet_init里只向inet_protos注册了ICMP, IGMP, TCP, UDP这几个协议,以TCP为例,其net_protocol定义为

/*
 * TCP's entry for the inet_protos table. As the notes below describe, the IP
 * layer looks this structure up by protocol number when delivering a packet
 * upward and calls ->handler on it.
 */
static const struct net_protocol tcp_protocol = {
    .handler =  tcp_v4_rcv,             /* receive entry point called from IP */
    .err_handler =  tcp_v4_err,         /* error callback (presumably ICMP-driven; not shown here) */
    .gso_send_check = tcp_v4_gso_send_check,
    .gso_segment =  tcp_tso_segment,
    .gro_receive =  tcp4_gro_receive,
    .gro_complete = tcp4_gro_complete,
    .no_policy =    1,
    .netns_ok = 1,
};

IP层在把报文往上送的时候,e.g. ip_local_deliver_finish,实际上就是根据skb的protocol在inet_protos里找到对应的net_protocol结构,然后调用net_protocol->handler函数,e.g. 如果是TCP协议的skb,这时就调用tcp_v4_rcv

下面开始初始化inetsw数组以及inetsw_array数组,inetsw是个list_head数组,每个索引代表一种socket类型(即socket(2)的第二个参数),如 SOCK_STREAM, SOCK_DGRAM, SOCK_RAW等,定义如下

/*
 * Socket types. Per the notes above, each value is used as an index into the
 * inetsw list_head array. The numbering is not contiguous: it jumps from
 * SOCK_DCCP (6) to SOCK_PACKET (10).
 */
enum sock_type {
    SOCK_STREAM = 1,
    SOCK_DGRAM  = 2,
    SOCK_RAW    = 3,
    SOCK_RDM    = 4,
    SOCK_SEQPACKET  = 5,
    SOCK_DCCP   = 6,
    SOCK_PACKET = 10,
};  

inetsw_array数组是一个inet_protosw类型的数组,定义如下

static struct inet_protosw inetsw_array[] =
{       
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                  INET_PROTOSW_ICSK,
    },
    
    {   
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
       },
    
       {
           .type =       SOCK_RAW,
           .protocol =   IPPROTO_IP,    /* wild card */
           .prot =       &raw_prot,
           .ops =        &inet_sockraw_ops,
           .no_check =   UDP_CSUM_DEFAULT,
           .flags =      INET_PROTOSW_REUSE,
       }
};

而inet_protosw定义如下

/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {  
    struct list_head list;        /* list linkage for this entry */
    /* These two fields form the lookup key.  */
    unsigned short   type;     /* This is the 2nd argument to socket(2). */
    unsigned short   protocol; /* This is the L4 protocol number.  */        
    struct proto     *prot;       /* protocol implementation, e.g. &tcp_prot */
    const struct proto_ops *ops;  /* socket-level operations, e.g. &inet_stream_ops */
    char             no_check;   /* checksum on rcv/xmit/none? */
    unsigned char    flags;      /* See INET_PROTOSW_* below.  */
};         
/* Bit flags for inet_protosw.flags; they may be OR-ed together. */
#define INET_PROTOSW_REUSE 0x01      /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02  /* Permanent protocols are unremovable. */
#define INET_PROTOSW_ICSK      0x04  /* Is this an inet_connection_sock? */

可以看出inet_protosw的list成员是一个list_head,用来把该结构挂到inetsw数组中对应type的那条链表上

最后是分别调用 arp_init, ip_init, tcp_v4_init, tcp_init, udp_init 等,这里略过了


下面来谈IP协议,这里我们略过IP option部分,因为实际应用的网络几乎不会有IP option出现,先看IP头部

/*
 * IPv4 header without options. Multi-byte fields use big-endian types
 * (__be16 / __be32). ihl and version share the first byte, so their
 * declaration order depends on the platform's bitfield endianness.
 */
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u8    ihl:4,
        version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
    __u8    version:4,
        ihl:4;
#else
#error  "Please fix <asm/byteorder.h>"
#endif
    __u8    tos;
    __be16  tot_len;    /* total length in bytes */
    __be16  id;         /* identical across all fragments of one packet */
    __be16  frag_off;
    __u8    ttl;
    __u8    protocol;   /* L4 protocol number */
    __sum16 check;      /* checksum of the IP header */
    __be32  saddr;
    __be32  daddr;
    /*The options start here. */
};

ihl的单位是4字节,一般而言IP头部长度是20字节(不带option),因此这个值是5

tot_len单位是字节

id一般用于IP的分段/组合,同一IP包的所有分段其ID值是相同的

protocol表示4层协议值

check是IP首部的校验和


sk_buff 结构中,skb->csum保存了L4的校验和,skb->ip_summed表示校验和的状态

CHECKSUM_NONE,表示L4校验和无效,需要重新计算

CHECKSUM_HW(较新的内核在接收方向上改称CHECKSUM_COMPLETE),表示网卡已经计算出校验和并保存在skb->csum中,但协议栈仍需要据此验证L4校验和是否正确

CHECKSUM_UNNECESSARY,表示L4校验和无需验证


/*
 * Packet-type descriptor for IPv4. Per the notes below, L2 uses it to find
 * ip_rcv and hand the packet up to L3.
 */
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),  /* ETH_P_IP converted to big-endian 16-bit */
    .func = ip_rcv,                 /* receive entry point called from L2 */
    .gso_send_check = inet_gso_send_check,
    .gso_segment = inet_gso_segment,
    .gro_receive = inet_gro_receive,
    .gro_complete = inet_gro_complete,
};

L2层通过ip_packet_type找到ip_rcv函数,从而把报文传到L3,下面分析下ip_rcv 函数:

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop; 
        
    IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto out;
    }   
        
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;
    
    iph = ip_hdr(skb);

如果skb是通过混杂模式获取的且不是发往本机的,直接丢弃;如果skb是share的,调用skb_share_check复制一份出来处理;pskb_may_pull这个函数比较复杂,其目的是,确保在skb->data开始的线性内存里面至少有 iphdr 的内容(这里要提下sk_buff这个结构的复杂性就在于:真正的报文内容很多情况下并不在skb的线性内存中,通常情况下,skb线性数据区的末尾会跟着一个skb_shared_info结构,如果IP包没有分片的话,这里会存储scatter-gather的报文内容,这些内容是分散在各个不同的内存页中的,用一个 skb_frag_t 数组frags表示,nr_frags里保存了数组中元素的个数;如果IP包存在分片的话,可以看到有个sk_buff的链表frag_list,里面就是分片的skb咯),如果skb->data后续的线性数据不够,pskb_may_pull会扩充这个skb结构,然后把frags或者frag_list里的IP头内容拷出来填到skb线性内存里

    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

    if (!pskb_may_pull(skb, iph->ihl*4))
        goto inhdr_error;

    iph = ip_hdr(skb);

    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;

    len = ntohs(iph->tot_len);
    if (skb->len < len) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
        goto drop;
    } else if (len < (iph->ihl*4))
        goto inhdr_error;

这段代码基本都是做一些check,略过了

    /* Our transport medium may have padded the buffer out. Now we know it
     * is IP we can trim to the true length of the frame.
     * Note this now means skb->len holds ntohs(iph->tot_len).
     */
    if (pskb_trim_rcsum(skb, len)) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

pskb_trim_rcsum用于去掉L2用来padding的部分,并重新计算checksum,了解下就行了


return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
               ip_rcv_finish);

最后走一遍netfilter,如果不被DROP或啥的,进入ip_rcv_finish


/*
 * Continuation of ip_rcv: passed as the final argument to the
 * NF_INET_PRE_ROUTING NF_HOOK call. Attaches a route to the packet if it has
 * none, processes IP options when present, updates multicast/broadcast
 * counters, and passes the packet on through dst_input.
 *
 * Returns the result of dst_input(skb), or NET_RX_DROP after freeing the skb
 * when routing or option processing fails.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;

    /*
     *  Initialise the virtual path cache for the packet. It describes
     *  how the packet travels inside Linux networking.
     */
    if (skb_dst(skb) == NULL) {
        /* No route attached yet: look one up from the header fields. */
        int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
                     skb->dev);
        if (unlikely(err)) {
            /* Only these two errors bump a counter; any other error is
             * dropped without being counted here. */
            if (err == -EHOSTUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INADDRERRORS);
            else if (err == -ENETUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INNOROUTES);
            goto drop;
        }   
    }

    /* ihl counts 4-byte words, so ihl > 5 means the header is longer than
     * 20 bytes, i.e. options are present. A non-zero return from
     * ip_rcv_options drops the packet. */
    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    /* Account received multicast / broadcast packets and bytes. */
    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
                skb->len);
    } else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
                skb->len);

    /* Per the notes below, the route's input handler is ip_local_deliver
     * or ip_forward (ip_error for unreachable destinations). */
    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}

ip_rcv_finish首先调用ip_route_input获取目的地路由,关于路由的部分放到以后说,这里通过本地路由表,会得知这个包究竟是应该本地接收还是给转发出去,ip_route_input会把路由信息存到 (struct dst_entry *)skb->_skb_dst 中,而这个dst_entry->input 的函数指针究竟指向ip_local_deliver还是ip_forward是在ip_route_input_slow里决定的(ip_route_input_slow由ip_route_input调用)

ip_route_input_slow中,先调用ip_mkroute_input,查看是否有转发路由表项,如果没有则返错表示是本地接收。ip_mkroute_input会调用__mkroute_input,里面会调用dst_alloc创建一个rtable,并设置rth->u.dst.input = ip_forward,代码段如下:

    rth = dst_alloc(&ipv4_dst_ops);
    if (!rth) {
        err = -ENOBUFS;
        goto cleanup;
    }
    atomic_set(&rth->u.dst.__refcnt, 1);
    rth->u.dst.flags= DST_HOST;
    if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
        rth->u.dst.flags |= DST_NOPOLICY;
    if (IN_DEV_CONF_GET(out_dev, NOXFRM))
        rth->u.dst.flags |= DST_NOXFRM;
    rth->fl.fl4_dst = daddr;
    rth->rt_dst = daddr;
    rth->fl.fl4_tos = tos;
    rth->fl.mark    = skb->mark;
    rth->fl.fl4_src = saddr;
    rth->rt_src = saddr;
    rth->rt_gateway = daddr;
    rth->rt_iif     =
        rth->fl.iif = in_dev->dev->ifindex;
    rth->u.dst.dev  = (out_dev)->dev;
    dev_hold(rth->u.dst.dev);
    rth->idev   = in_dev_get(rth->u.dst.dev);
    rth->fl.oif     = 0;
    rth->rt_spec_dst= spec_dst;
    rth->u.dst.input = ip_forward;
    rth->u.dst.output = ip_output;
    rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
    rt_set_nexthop(rth, res, itag);
    rth->rt_flags = flags;


如果是broadcast input, 或者local_input,会走进如下代码段:

local_input:
    rth = dst_alloc(&ipv4_dst_ops);
    if (!rth)
        goto e_nobufs;

    rth->u.dst.output= ip_rt_bug;
    rth->rt_genid = rt_genid(net);

    atomic_set(&rth->u.dst.__refcnt, 1);
    rth->u.dst.flags= DST_HOST;
    if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
        rth->u.dst.flags |= DST_NOPOLICY;
    rth->fl.fl4_dst = daddr;
    rth->rt_dst = daddr;
    rth->fl.fl4_tos = tos;
    rth->fl.mark    = skb->mark;
    rth->fl.fl4_src = saddr;
    rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
    rth->u.dst.tclassid = itag;
#endif
    rth->rt_iif =
    rth->fl.iif = dev->ifindex;
    rth->u.dst.dev  = net->loopback_dev;
    dev_hold(rth->u.dst.dev);
    rth->idev   = in_dev_get(rth->u.dst.dev);
    rth->rt_gateway = daddr;
    rth->rt_spec_dst= spec_dst;
    rth->u.dst.input= ip_local_deliver;
    rth->rt_flags   = flags|RTCF_LOCAL;
    if (res.type == RTN_UNREACHABLE) {
        rth->u.dst.input= ip_error;
        rth->u.dst.error= -err;
        rth->rt_flags   &= ~RTCF_LOCAL;
    }
    rth->rt_type    = res.type;
    hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
    err = rt_intern_hash(hash, rth, NULL, skb);
    goto done;