sk_buff 是网络数据包的载体,是最关键的结构体之一。
/**
 *	struct sk_buff - socket buffer
 *	@next: Next buffer in list
 *	@prev: Previous buffer in list
 *	@tstamp: Time we arrived/left
 *	@rbnode: RB tree node, alternative to next/prev for netem/tcp
 *	@sk: Socket we are owned by
 *	@dev: Device we arrived on/are leaving by
 *	@cb: Control buffer. Free for use by every layer. Put private vars here
 *	@_skb_refdst: destination entry (with norefcount bit)
 *	@sp: the security path, used for xfrm
 *	@len: Length of actual data
 *	@data_len: Data length
 *	@mac_len: Length of link layer header
 *	@hdr_len: writable header length of cloned skb
 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@priority: Packet queueing priority
 *	@ignore_df: allow local fragmentation
 *	@cloned: Head may be cloned (check refcnt to be sure)
 *	@ip_summed: Driver fed us an IP checksum
 *	@nohdr: Payload reference only, must not modify header
 *	@pkt_type: Packet class
 *	@fclone: skbuff clone status
 *	@ipvs_property: skbuff is owned by ipvs
 *	@tc_skip_classify: do not classify packet. set by IFB device
 *	@tc_at_ingress: used within tc_classify to distinguish in/egress
 *	@tc_redirected: packet was redirected by a tc action
 *	@tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
 *	@peeked: this packet has been seen already, so stats have been
 *		done for it, don't do them again
 *	@nf_trace: netfilter packet trace flag
 *	@protocol: Packet protocol from driver
 *	@destructor: Destruct function
 *	@_nfct: Associated connection, if any (with nfctinfo bits)
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@skb_iif: ifindex of device we arrived on
 *	@tc_index: Traffic control index
 *	@hash: the packet hash
 *	@queue_mapping: Queue mapping for multiqueue devices
 *	@xmit_more: More SKBs are pending for this queue
 *	@ndisc_nodetype: router type (from link layer)
 *	@ooo_okay: allow the mapping of a socket to a queue to be changed
 *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *		ports.
 *	@sw_hash: indicates hash was computed in software stack
 *	@wifi_acked_valid: wifi_acked was set
 *	@wifi_acked: whether frame was acked on wifi or not
 *	@no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
 *	@dst_pending_confirm: need to confirm neighbour
 *	@napi_id: id of the NAPI struct this skb came from
 *	@secmark: security marking
 *	@mark: Generic packet mark
 *	@vlan_proto: vlan encapsulation protocol
 *	@vlan_tci: vlan tag control information
 *	@inner_protocol: Protocol (encapsulation)
 *	@inner_transport_header: Inner transport layer header (encapsulation)
 *	@inner_network_header: Network layer header (encapsulation)
 *	@inner_mac_header: Link layer header (encapsulation)
 *	@transport_header: Transport layer header
 *	@network_header: Network layer header
 *	@mac_header: Link layer header
 *	@tail: Tail pointer
 *	@end: End pointer
 *	@head: Head of buffer
 *	@data: Data head pointer
 *	@truesize: Buffer size
 *	@users: User count - see {datagram,tcp}.c
 */
struct sk_buff {
	union {
		struct {
			/* These two members must be first. */
			struct sk_buff		*next;	/* links skbs into a list */
			struct sk_buff		*prev;

			union {
				/* Packet timestamp; set when the packet
				 * arrives or leaves.
				 */
				ktime_t		tstamp;
				struct skb_mstamp skb_mstamp;
			};
		};
		struct rb_node	rbnode; /* used in netem & tcp stack */
	};

	/* The sock at the INET layer that owns this skb. It is only set once
	 * the packet reaches the INET layer; forwarded packets and packets
	 * handled purely at the IP layer leave it unset.
	 */
	struct sock		*sk;

	union {
		/* Receiving or transmitting device: points to the ingress
		 * device for received packets and to the egress device for
		 * transmitted ones. It may be changed later while the packet
		 * is processed.
		 * NOTE(review): the original note also mentions "input_dev" as
		 * the original receiving device; no such member exists in this
		 * struct (see @skb_iif for the ingress ifindex).
		 */
		struct net_device	*dev;
		/* Some protocols might use this space to store information,
		 * while device pointer would be NULL.
		 * UDP receive path is one user.
		 */
		unsigned long		dev_scratch;
	};
	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 *
	 * In other words: a private scratch area in which each layer the
	 * packet passes through may keep its own per-packet data.
	 */
	char			cb[48] __aligned(8);

	unsigned long		_skb_refdst;
	/* Destructor callback, run when the skb is released to perform
	 * final cleanup.
	 */
	void			(*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
	struct	sec_path	*sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	unsigned long		 _nfct;	/* netfilter connection tracking */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	struct nf_bridge_info	*nf_bridge;
#endif
	unsigned int		len,		/* length of the skb data */
				data_len;	/* length of the data held in scatter-gather fragments */
	__u16			mac_len,	/* link layer (Ethernet) header length */
				hdr_len;

	/* Following fields are _not_ copied in __copy_skb_header()
	 * Note that queue_mapping is here mostly to fill a hole.
	 */
	kmemcheck_bitfield_begin(flags1);
	__u16			queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK	(1 << 7)
#else
#define CLONED_MASK	1
#endif
#define CLONED_OFFSET()		offsetof(struct sk_buff, __cloned_offset)

	__u8			__cloned_offset[0];
	__u8			cloned:1,	/* whether this packet has been cloned */
				nohdr:1,	/* payload reference only; the header must not be modified */
				fclone:2,	/* clone status: original (parent) or clone (child) skb */
				peeked:1,
				head_frag:1,
				xmit_more:1,
				__unused:1; /* one bit hole */
	kmemcheck_bitfield_end(flags1);

	/* fields enclosed in headers_start/headers_end are copied
	 * using a single memcpy() in __copy_skb_header()
	 */
	/* private: */
	__u32			headers_start[0];
	/* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX	(7 << 5)
#else
#define PKT_TYPE_MAX	7
#endif
#define PKT_TYPE_OFFSET()	offsetof(struct sk_buff, __pkt_type_offset)

	__u8			__pkt_type_offset[0];
	/* Frame class, determined by the destination MAC address: e.g.
	 * PACKET_HOST for frames addressed to this host, PACKET_BROADCAST
	 * for broadcast frames.
	 */
	__u8			pkt_type:3;
	__u8			pfmemalloc:1;
	__u8			ignore_df:1;	/* ignore the DF (don't fragment) flag */
	__u8			nf_trace:1;
	/* State of the transport-layer checksum, e.g. already verified or
	 * left to the hardware.
	 */
	__u8			ip_summed:2;
	__u8			ooo_okay:1;
	__u8			l4_hash:1;
	__u8			sw_hash:1;
	__u8			wifi_acked_valid:1;
	__u8			wifi_acked:1;

	__u8			no_fcs:1;
	/* Indicates the inner headers are valid in the skbuff. */
	__u8			encapsulation:1;
	__u8			encap_hdr_csum:1;
	__u8			csum_valid:1;
	__u8			csum_complete_sw:1;
	__u8			csum_level:2;
	__u8			csum_bad:1;

	__u8			dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	__u8			ipvs_property:1;
	__u8			inner_protocol_type:1;
	__u8			remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
	__u8			offload_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
	__u8			tc_skip_classify:1;
	__u8			tc_at_ingress:1;
	__u8			tc_redirected:1;
	__u8			tc_from_ingress:1;
#endif

#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#endif

	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};
	__u32			priority;
	int			skb_iif;
	__u32			hash;
	__be16			vlan_proto;
	__u16			vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
	union {
		unsigned int	napi_id;
		unsigned int	sender_cpu;
	};
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32		secmark;
#endif

	union {
		__u32		mark;
		__u32		reserved_tailroom;
	};

	union {
		__be16		inner_protocol;
		__u8		inner_ipproto;
	};

	__u16			inner_transport_header;
	__u16			inner_network_header;
	__u16			inner_mac_header;

	/* Upper-layer protocol as seen from the MAC layer (ARP, IP, IPv6,
	 * PPP, ...). It is filled in after the packet has been received
	 * (netif_rx) and before the protocol handler runs, and selects which
	 * protocol the packet is handed to.
	 */
	__be16			protocol;
	__u16			transport_header;
	__u16			network_header;
	__u16			mac_header;

	/* private: */
	__u32			headers_end[0];
	/* public: */

	/* These elements must be at the end, see alloc_skb() for details. */
	/* tail, end, head and data point at different positions of the
	 * data buffer.
	 */
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
	unsigned int		truesize;	/* total size of the whole data buffer */
	/* Reference count: how many entities hold this skb; used to decide
	 * when the skb may be freed.
	 */
	atomic_t		users;
};
sk_buff结构体的内容其实不少,但是掌握其中各个字段的含义对后面的分析会很重要。
这里需要明确的是,sk_buff本身存放在一块内存里面,而真正的数据存放在另外一块内存里面,sk_buff通过指针指向这块数据区;紧接在数据区尾部的是一个skb_shared_info结构体,用于控制该数据的IP分片。
/*
 * This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 *
 * __alloc_skb() sizes the data allocation to include room for this
 * structure in addition to the requested data area.
 */
struct skb_shared_info {
unsigned short _unused;
/* NOTE(review): presumably the number of entries of frags[] in use - confirm */
unsigned char nr_frags;
__u8 tx_flags;
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
unsigned int gso_type;
u32 tskey;
__be32 ip6_frag_id;
/*
 * Warning : all fields before dataref are cleared in __alloc_skb()
 */
/* __alloc_skb() sets dataref to 1 for a freshly allocated buffer. */
atomic_t dataref;
/* Intermediate layers must ensure that destructor_arg
 * remains valid until skb destructor */
void * destructor_arg;
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
};
下图是sock,skb,skb_shinfo和数据缓存之间的关系,一个sock对应的是一个应用层的socket,它的收包队列由skb构成,skb通过alloc_skb分配在高速缓存里面。
skb通过指针指向真实数据,这些数据存放在内存里面,head指向头部,end指向尾部,同时end指向的位置也是skb_shinfo的起始位置
一个sock可能收到很多个包,所以每个包都串联在同一个链表里面
如果开启了聚合分散I/O,分片的报文共享内存,则指针情况可能是这样的:
下面是操作SKB的一些函数,它们像“瑞士军刀”一样实现对skb的快捷操作:
调用alloc_skb后,得到的skb结构如下图所示,其实现代码如下:
186 /**
187 * __alloc_skb - allocate a network buffer
188 * @size: size to allocate
189 * @gfp_mask: allocation mask
190 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
191 * instead of head cache and allocate a cloned (child) skb.
192 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
193 * allocations in case the data is required for writeback
194 * @node: numa node to allocate memory on
195 *
196 * Allocate a new &sk_buff. The returned buffer has no
headroom and a
197 * tail room of at least size bytes. The object has a reference
count
198 * of one. The return is the buffer. On a failure the return is
%NULL.
199 *
200 * Buffers may only be allocated from interrupts using a @gfp_mask
of
201 * %GFP_ATOMIC.
202 */
203 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
204 int flags, int node)
205 {
206 struct kmem_cache *cache;
207 struct skb_shared_info *shinfo;
208 struct sk_buff *skb;
209 u8 *data;
210 bool pfmemalloc;
211
212 cache = (flags & SKB_ALLOC_FCLONE)
213 ? skbuff_fclone_cache : skbuff_head_cache; //决定在哪个高速缓存分配
214
215 if (sk_memalloc_socks() && (flags &
SKB_ALLOC_RX))
216 gfp_mask |= __GFP_MEMALLOC;
217
218 /* Get the HEAD */
219 skb = kmem_cache_alloc_node(cache, gfp_mask &
~__GFP_DMA, node); //在高速缓存里面申请一块空间给SKB
220 if (!skb)
221 goto out;
222 prefetchw(skb);
224 /* We do our best to align skb_shared_info on a separate cache
225 * line. It usually works because kmalloc(X >
SMP_CACHE_BYTES) gives
226 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
227 * Both skb->head and skb_shared_info are cache line
aligned.
228 */
229 size = SKB_DATA_ALIGN(size); //数据区对其后的大小
230 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
//加上skb_shared_info后的大小
231 data = kmalloc_reserve(size, gfp_mask, node,
&pfmemalloc); //在内存中分配这块数据区
232 if (!data)
233 goto nodata;
234 /* kmalloc(size) might give us more room than requested.
235 * Put skb_shared_info exactly at the end of allocated zone,
236 * to allow max possible filling before reallocation.
237 */
238 size = SKB_WITH_OVERHEAD(ksize(data));
239 prefetchw(data + size);
240
241 /*
242 * Only clear those fields we need to clear, not those that we
will
243 * actually initialise below. Hence, don't put any more fields
after
244 * the tail pointer in struct sk_buff!
245 */
246 memset(skb, 0, offsetof(struct sk_buff, tail));
247 /* Account for allocated memory : skb + skb->head */
248 skb->truesize = SKB_TRUESIZE(size);
249 skb->pfmemalloc = pfmemalloc;
250 atomic_set(&skb->users, 1);
251 skb->head = data;
252 skb->data = data;
253 skb_reset_tail_pointer(skb);
254 skb->end = skb->tail + size;
255 skb->mac_header = (typeof(skb->mac_header))~0U;
256 skb->transport_header =
(typeof(skb->transport_header))~0U;
258 /* make sure we initialize shinfo sequentially */
259 shinfo = skb_shinfo(skb);
260 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
261 atomic_set(&shinfo->dataref, 1);
262 kmemcheck_annotate_variable(shinfo->destructor_arg);
263
264 if (flags & SKB_ALLOC_FCLONE) {
265 struct sk_buff_fclones *fclones;
266
267 fclones = container_of(skb, struct sk_buff_fclones, skb1);
268
269 kmemcheck_annotate_bitfield(&fclones->skb2,
flags1);
270 skb->fclone = SKB_FCLONE_ORIG;
271 atomic_set(&fclones->fclone_ref, 1);
272
273 fclones->skb2.fclone = SKB_FCLONE_CLONE;
274 }
275 out:
276 return skb;
277 nodata:
278 kmem_cache_free(cache, skb);
279 skb = NULL;
280 goto out;
281 }
282 EXPORT_SYMBOL(__alloc_skb);