linux内核网络协议栈学习笔记（6）

本篇讨论IP包的收发（暂不包括路由）

先来看inet_init，

首先是调用proto_register，注册了tcp_prot, udp_prot, raw_prot，其中proto_register前半部分是初始化各种slab_cache，后半部分把这些struct proto结构链到proto_list里

其次调用sock_register，内核有一个全局的net_proto_family结构的net_families数组，inet_init调用sock_register就是把inet_family_ops加到net_families[PF_INET]中，inet_family_ops结构如下

static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};

接着调用inet_add_protocol，去填充inet_protos数组，inet_protos是一个全局的指针数组，其定义如下：

const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;

可以看出数组最大长度MAX_INET_PROTOS为256，in.h里对所有的协议做了定义

/* Standard well-defined IP protocols. */
enum {
IPPROTO_IP = 0, /* Dummy protocol for TCP */
IPPROTO_ICMP = 1, /* Internet Control Message Protocol */
IPPROTO_IGMP = 2, /* Internet Group Management Protocol */
IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */
IPPROTO_TCP = 6, /* Transmission Control Protocol */
IPPROTO_EGP = 8, /* Exterior Gateway Protocol */
IPPROTO_PUP = 12, /* PUP protocol */
IPPROTO_UDP = 17, /* User Datagram Protocol */
IPPROTO_IDP = 22, /* XNS IDP protocol */
IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */
IPPROTO_RSVP = 46, /* RSVP protocol */
IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */
IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */
IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */
IPPROTO_AH = 51, /* Authentication Header protocol */
IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */
IPPROTO_PIM = 103, /* Protocol Independent Multicast */
IPPROTO_COMP = 108, /* Compression Header protocol */
IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */
IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */
IPPROTO_RAW = 255, /* Raw IP packets */
IPPROTO_MAX
};

inet_init只向inet_protos里注册了ICMP, IGMP, TCP, UDP这几个协议，以TCP为例，其net_protocol定义为

static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};

IP层在把报文往上送的时候，e.g. ip_local_deliver_finish，实际上就是根据IP头部的protocol字段（iph->protocol，注意不是表示L3协议类型的skb->protocol）在inet_protos里找到对应的net_protocol结构，然后调用net_protocol->handler函数，e.g. 如果是TCP协议的skb，这时就调用tcp_v4_rcv

下面开始初始化inetsw数组以及inetsw_array数组，inetsw是个list_head数组，每个索引代表了一种socket类型（即socket(2)的第二个参数，与所使用的四层协议相对应），如 SOCK_STREAM, SOCK_DGRAM, SOCK_RAW等，定义如下

enum sock_type {
SOCK_STREAM = 1,
SOCK_DGRAM = 2,
SOCK_RAW = 3,
SOCK_RDM = 4,
SOCK_SEQPACKET = 5,
SOCK_DCCP = 6,
SOCK_PACKET = 10,
};

inetsw_array数组是一个inet_protosw类型的数组，定义如下

static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},

{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},

{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};

而inet_protosw定义如下

/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot;
const struct proto_ops *ops;
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */

可以看出inet_protosw的list成员是一个链表节点，注册时就是通过它把该inet_protosw挂到inetsw[type]对应的list_head链表上

最后是分别调用 arp_init, ip_init, tcp_v4_init, tcp_init, udp_init 等，这里略过了

下面来谈IP协议，这里我们略过IP option部分，因为实际应用的网络几乎不会有IP option出现，先看IP头部

struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4,
version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 version:4,
ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
__u8 tos;
__be16 tot_len;
__be16 id;
__be16 frag_off;
__u8 ttl;
__u8 protocol;
__sum16 check;
__be32 saddr;
__be32 daddr;
/*The options start here. */
};

ihl单位是4字节，一般而言（不带IP option时）IP头部长度是20字节，因此这个值是5

tot_len单位是字节

id一般用于IP的分段/组合，同一IP包的所有分段其ID值是相同的

protocol表示4层协议值

check是IP首部的校验和

sk_buff 结构中，skb->csum保存了L4的校验和，skb->ip_summed表示校验和的状态

CHECKSUM_NONE，表示L4校验和无效，需要重新计算

CHECKSUM_HW，表示网卡已经对报文计算了校验和并把结果存在skb->csum里，但程序仍需要利用这个值再次验证L4校验和（在较新的内核中，接收方向的这个状态已改名为CHECKSUM_COMPLETE）

CHECKSUM_UNNECESSARY，表示L4校验和无需验证

static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};

L2层通过ip_packet_type找到ip_rcv函数，从而把报文传到L3，下面分析下ip_rcv 函数：

/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;

IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}

if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;

iph = ip_hdr(skb);

如果skb是通过混杂模式获取的且不是发往本机的，直接丢弃；如果skb是share的，调用skb_share_check复制一份出来处理；pskb_may_pull这个函数比较复杂，其目的是，确保在skb->data开始的线性内存里面至少有 iphdr 的内容（这里要提下sk_buff这个结构的复杂性就在于：真正的报文内容很多情况下并不全在skb的线性内存中，通常情况下，skb的线性数据区后面会紧跟着一个skb_shared_info结构，如果IP包没有分片的话，这里会存储scatter-gather的报文内容，这些内容是分散在各个不同的内存页中的，用一个 skb_frag_t 数组frags表示，nr_frags里保存了数组中元素的个数；如果IP包存在分片的话，可以看到有个sk_buff的链表frag_list，里面就是分片的skb），如果skb->data之后的线性内存不够，pskb_may_pull会扩充skb的线性数据区，然后把frags或者frag_list里的IP头内容拷出来填到skb线性内存里

if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;

if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;

iph = ip_hdr(skb);

if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;

len = ntohs(iph->tot_len);
if (skb->len < len) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;

这段代码基本都是做一些check，略过了

/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}

pskb_trim_rcsum用于去掉L2用来padding的部分，并重新计算checksum，了解下就行了

return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);

最后走一遍netfilter，如果不被DROP或啥的，进入ip_rcv_finish

static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;

/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
goto drop;
}
}

if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;

rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
skb->len);

return dst_input(skb);

drop:
kfree_skb(skb);
return NET_RX_DROP;
}

ip_rcv_finish首先调用ip_route_input获取目的地路由，关于路由的部分放到以后说，这里通过本地路由表，会得知这个包究竟是应该本地接收还是给转发出去，ip_route_input会把路由信息存到 (struct dst_entry *)skb->_skb_dst 中，而这个dst_entry->input 的函数指针究竟指向ip_local_deliver还是ip_forward是在ip_route_input_slow里决定的（ip_route_input_slow由ip_route_input调用）

ip_route_input_slow中，先调用fib_lookup查路由表，再根据查到的路由类型分别处理：如果是RTN_LOCAL或者RTN_BROADCAST，表示是本地接收，分别跳到local_input和brd_input；否则表示报文需要转发，这时调用ip_mkroute_input。ip_mkroute_input会调用__mkroute_input，里面会调用dst_alloc创建一个rtable，并设置rth->u.dst.input = ip_forward，代码段如下：

rth = dst_alloc(&ipv4_dst_ops);
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
if (IN_DEV_CONF_GET(out_dev, NOXFRM))
rth->u.dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
rth->rt_iif =
rth->fl.iif = in_dev->dev->ifindex;
rth->u.dst.dev = (out_dev)->dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
rt_set_nexthop(rth, res, itag);
rth->rt_flags = flags;

如果是broadcast input, 或者local_input，会走进如下代码段：

local_input:
rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
goto e_nobufs;

rth->u.dst.output= ip_rt_bug;
rth->rt_genid = rt_genid(net);

atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
rth->u.dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
rth->u.dst.dev = net->loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
rth->u.dst.input= ip_local_deliver;
rth->rt_flags = flags|RTCF_LOCAL;
if (res.type == RTN_UNREACHABLE) {
rth->u.dst.input= ip_error;
rth->u.dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb);
goto done;

秒客网

linux内核网络协议栈学习笔记（6）

相关文章