Network Protocol Stack Study: socket, sock_common, sock, and sk_buff

Date: 2022-02-24 02:11:49

  I have always been curious about how sockets are implemented and what the underlying data structures look like, so this post walks through them.
  struct socket is the structure the application-facing layer works with when declaring and defining a socket; underneath it references a struct sock, which in turn embeds a struct sock_common. struct sk_buff is the socket buffer used inside the kernel: a single object that carries the data and header state of each layer (transport, network/IP, and link/MAC). In the NAT translation discussed in an earlier post, besides modifying the existing Netfilter sources, another approach is to register your own hook functions, independent of Netfilter's built-in NAT table, and then craft new packets or modify existing ones by building sk_buffs yourself, thereby performing the NAT translation (i.e., making the communication work). A later post will be devoted to building your own sk_buff and hook functions.
  With that in mind, let's look at the data structures in the kernel source.
  First, struct socket:

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wq: wait queue for several uses
 */
struct socket {
    socket_state        state;

    short           type;

    unsigned long       flags;

    struct socket_wq __rcu  *wq;

    struct file     *file;
    struct sock     *sk;    // the next layer down: struct sock
    const struct proto_ops *ops;    // protocol-specific operations
};
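
  To see how struct socket and struct sock connect, here is a minimal kernel-module sketch (the socket_demo_* names are invented for illustration): it creates an in-kernel UDP socket with sock_create_kern() and then reads fields through both layers.

#include <linux/module.h>
#include <linux/net.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/net_namespace.h>

/* Demo only: create an in-kernel UDP socket. sock_create_kern() fills in
 * a struct socket, whose ->sk points at the underlying struct sock. */
static int __init socket_demo_init(void)
{
    struct socket *sock;
    int err;

    err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
    if (err < 0)
        return err;

    pr_info("socket: state=%d type=%d; sock: family=%u\n",
            sock->state, sock->type, sock->sk->sk_family);

    sock_release(sock); /* releases the underlying struct sock as well */
    return 0;
}

static void __exit socket_demo_exit(void)
{
}

module_init(socket_demo_init);
module_exit(socket_demo_exit);
MODULE_LICENSE("GPL");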

  Next comes struct sock:

struct sock {
    /*
     * Now struct inet_timewait_sock also uses sock_common, so please just
     * don't add nothing before this first member (__sk_common) --acme
     */
    struct sock_common  __sk_common;    // discussed in detail below
#define sk_node __sk_common.skc_node
#define sk_nulls_node __sk_common.skc_nulls_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping

#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end __sk_common.skc_dontcopy_end
#define sk_hash __sk_common.skc_hash
#define sk_portpair __sk_common.skc_portpair
#define sk_num __sk_common.skc_num
#define sk_dport __sk_common.skc_dport
#define sk_addrpair __sk_common.skc_addrpair
#define sk_daddr __sk_common.skc_daddr
#define sk_rcv_saddr __sk_common.skc_rcv_saddr
#define sk_family __sk_common.skc_family
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
#define sk_net __sk_common.skc_net
#define sk_v6_daddr __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
#define sk_cookie __sk_common.skc_cookie
#define sk_incoming_cpu __sk_common.skc_incoming_cpu
#define sk_flags __sk_common.skc_flags
#define sk_rxhash __sk_common.skc_rxhash

    socket_lock_t       sk_lock;
    atomic_t        sk_drops;
    int         sk_rcvlowat;
    struct sk_buff_head sk_error_queue;     // error queue (e.g. queued ICMP errors)
    struct sk_buff_head sk_receive_queue;   // receive queue
    /*
     * The backlog queue is special, it is always used with
     * the per-socket spinlock held and requires low latency
     * access. Therefore we special case it's implementation.
     * Note : rmem_alloc is in this structure to fill a hole
     * on 64bit arches, not because its logically part of
     * backlog.
     */
    struct {
        atomic_t    rmem_alloc;
        int     len;
        struct sk_buff  *head;
        struct sk_buff  *tail;
    } sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc

    int         sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
    unsigned int        sk_ll_usec;
    /* ===== mostly read cache line ===== */
    unsigned int        sk_napi_id;
#endif
    int         sk_rcvbuf;

    struct sk_filter __rcu  *sk_filter;
    union {
        struct socket_wq __rcu  *sk_wq;
        struct socket_wq    *sk_wq_raw;
    };
#ifdef CONFIG_XFRM
    struct xfrm_policy __rcu *sk_policy[2];
#endif
    struct dst_entry    *sk_rx_dst;
    struct dst_entry __rcu  *sk_dst_cache;
    atomic_t        sk_omem_alloc;
    int         sk_sndbuf;

    /* ===== cache line for TX ===== */
    int         sk_wmem_queued;
    refcount_t      sk_wmem_alloc;
    unsigned long       sk_tsq_flags;
    union {
        struct sk_buff  *sk_send_head;
        struct rb_root  tcp_rtx_queue;
    };
    struct sk_buff_head sk_write_queue;
    __s32           sk_peek_off;
    int         sk_write_pending;
    __u32           sk_dst_pending_confirm;
    u32         sk_pacing_status; /* see enum sk_pacing */
    long            sk_sndtimeo;
    struct timer_list   sk_timer;
    __u32           sk_priority;
    __u32           sk_mark;
    u32         sk_pacing_rate; /* bytes per second */
    u32         sk_max_pacing_rate;
    struct page_frag    sk_frag;
    netdev_features_t   sk_route_caps;
    netdev_features_t   sk_route_nocaps;
    int         sk_gso_type;
    unsigned int        sk_gso_max_size;
    gfp_t           sk_allocation;
    __u32           sk_txhash;

    /*
     * Because of non atomicity rules, all
     * changes are protected by socket lock.
     */
    unsigned int        __sk_flags_offset[0];
#ifdef __BIG_ENDIAN_BITFIELD
#define SK_FL_PROTO_SHIFT 16
#define SK_FL_PROTO_MASK 0x00ff0000

#define SK_FL_TYPE_SHIFT 0
#define SK_FL_TYPE_MASK 0x0000ffff
#else
#define SK_FL_PROTO_SHIFT 8
#define SK_FL_PROTO_MASK 0x0000ff00

#define SK_FL_TYPE_SHIFT 16
#define SK_FL_TYPE_MASK 0xffff0000
#endif

    unsigned int        sk_padding : 1,
                sk_kern_sock : 1,
                sk_no_check_tx : 1,
                sk_no_check_rx : 1,
                sk_userlocks : 4,
                sk_protocol  : 8,
                sk_type      : 16;
#define SK_PROTOCOL_MAX U8_MAX
    u16         sk_gso_max_segs;
    u8          sk_pacing_shift;
    unsigned long           sk_lingertime;
    struct proto        *sk_prot_creator;
    rwlock_t        sk_callback_lock;
    int         sk_err,
                sk_err_soft;
    u32         sk_ack_backlog;
    u32         sk_max_ack_backlog;
    kuid_t          sk_uid;
    struct pid      *sk_peer_pid;
    const struct cred   *sk_peer_cred;
    long            sk_rcvtimeo;
    ktime_t         sk_stamp;
    u16         sk_tsflags;
    u8          sk_shutdown;
    u32         sk_tskey;
    atomic_t        sk_zckey;
    struct socket       *sk_socket;
    void            *sk_user_data;
#ifdef CONFIG_SECURITY
    void            *sk_security;
#endif
    struct sock_cgroup_data sk_cgrp_data;
    struct mem_cgroup   *sk_memcg;
    void            (*sk_state_change)(struct sock *sk);
    void            (*sk_data_ready)(struct sock *sk);
    void            (*sk_write_space)(struct sock *sk);
    void            (*sk_error_report)(struct sock *sk);
    int         (*sk_backlog_rcv)(struct sock *sk,
                          struct sk_buff *skb);
    void                    (*sk_destruct)(struct sock *sk);
    struct sock_reuseport __rcu *sk_reuseport_cb;
    struct rcu_head     sk_rcu;
};
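
  Notice the long block of #define aliases at the top: they let code write sk->sk_family even though the field actually lives in __sk_common. The function pointers near the bottom (sk_data_ready and friends) are the notification hooks that the protocol layers invoke. As a hedged sketch (my_data_ready and install_hook are invented names), a kernel module can swap in its own callback under sk_callback_lock:

#include <linux/printk.h>
#include <net/sock.h>

static void (*orig_data_ready)(struct sock *sk);

/* Invented hook: log the peer's address via the sk_* aliases, then
 * chain to the original callback so normal wakeups still happen. */
static void my_data_ready(struct sock *sk)
{
    pr_info("data ready from %pI4:%u\n", &sk->sk_daddr, ntohs(sk->sk_dport));
    orig_data_ready(sk);
}

static void install_hook(struct sock *sk)
{
    write_lock_bh(&sk->sk_callback_lock);   /* protects the callback pointers */
    orig_data_ready = sk->sk_data_ready;
    sk->sk_data_ready = my_data_ready;
    write_unlock_bh(&sk->sk_callback_lock);
}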

  struct sock embeds struct sock_common:

struct sock_common {
    /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
     * address on 64bit arches : cf INET_MATCH()
     */
    union {
        __addrpair  skc_addrpair;
        struct {
            __be32  skc_daddr;      // destination (peer) address
            __be32  skc_rcv_saddr;  // local bound/receive address
        };
    };
    union  {
        unsigned int    skc_hash;
        __u16       skc_u16hashes[2];
    };
    /* skc_dport && skc_num must be grouped as well */
    union {
        __portpair  skc_portpair;
        struct {
            __be16  skc_dport;  // destination (peer) port
            __u16   skc_num;    // local port, host byte order
        };
    };

    unsigned short      skc_family;
    volatile unsigned char  skc_state;
    unsigned char       skc_reuse:4;
    unsigned char       skc_reuseport:1;
    unsigned char       skc_ipv6only:1;
    unsigned char       skc_net_refcnt:1;
    int         skc_bound_dev_if;
    union {
        struct hlist_node   skc_bind_node;
        struct hlist_node   skc_portaddr_node;
    };
    struct proto        *skc_prot;  // protocol handlers (e.g. tcp_prot, udp_prot)
    possible_net_t      skc_net;

#if IS_ENABLED(CONFIG_IPV6)
    struct in6_addr     skc_v6_daddr;
    struct in6_addr     skc_v6_rcv_saddr;
#endif

    atomic64_t      skc_cookie;

    /* following fields are padding to force
     * offset(struct sock, sk_refcnt) == 128 on 64bit arches
     * assuming IPV6 is enabled. We use this padding differently
     * for different kind of 'sockets'
     */
    union {
        unsigned long   skc_flags;
        struct sock *skc_listener; /* request_sock */
        struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
    };
    /*
     * fields between dontcopy_begin/dontcopy_end
     * are not copied in sock_copy()
     */
    /* private: */
    int         skc_dontcopy_begin[0];
    /* public: */
    union {
        struct hlist_node   skc_node;
        struct hlist_nulls_node skc_nulls_node;
    };
    int         skc_tx_queue_mapping;
    union {
        int     skc_incoming_cpu;
        u32     skc_rcv_wnd;
        u32     skc_tw_rcv_nxt; /* struct tcp_timewait_sock */
    };

    refcount_t      skc_refcnt;
    /* private: */
    int                     skc_dontcopy_end[0];
    union {
        u32     skc_rxhash;
        u32     skc_window_clamp;
        u32     skc_tw_snd_nxt; /* struct tcp_timewait_sock */
    };
    /* public: */
};
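
  A detail worth calling out: the skc_addrpair/skc_portpair unions overlay the two addresses (or the two ports) with one wider integer, so connection lookup (cf. INET_MATCH()) can match both in a single comparison. A standalone, userspace-style illustration of that packing trick (names here are made up):

#include <stdint.h>
#include <stdio.h>

/* Illustration only, mirroring __addrpair: two 32-bit addresses share
 * storage with one 64-bit value, so "does this packet belong to this
 * connection?" can be a single integer compare. */
union addrpair {
    uint64_t pair;
    struct {
        uint32_t daddr;
        uint32_t saddr;
    };
};

int main(void)
{
    union addrpair conn, pkt;

    conn.daddr = 0x0100007f;    /* 127.0.0.1, as stored on a little-endian host */
    conn.saddr = 0x0200007f;    /* 127.0.0.2 */
    pkt = conn;

    printf("match: %s\n", conn.pair == pkt.pair ? "yes" : "no");
    return 0;
}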

  Finally, struct sk_buff, the packet buffer used throughout the kernel:

struct sk_buff {
    union {
        struct {
            /* These two members must be first. (doubly linked list) */
            struct sk_buff      *next;
            struct sk_buff      *prev;

            union {
                struct net_device   *dev;
                /* Some protocols might use this space to store information,
                 * while device pointer would be NULL.
                 * UDP receive path is one user.
                 */
                unsigned long       dev_scratch;
            };
        };
        struct rb_node  rbnode; /* used in netem & tcp stack (red-black tree node) */
    };
    struct sock     *sk;    // owning struct sock

    union {
        ktime_t     tstamp;     // timestamp
        u64     skb_mstamp;
    };
    /*
     * This is the control buffer. It is free to use for every
     * layer. Please put your private variables there. If you
     * want to keep them across layers you have to do a skb_clone()
     * first. This is owned by whoever has the skb queued ATM.
     */
    char            cb[48] __aligned(8);

    union {
        struct {
            unsigned long   _skb_refdst;
            void        (*destructor)(struct sk_buff *skb);
        };
        struct list_head    tcp_tsorted_anchor;
    };

#ifdef CONFIG_XFRM
    struct  sec_path    *sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
    unsigned long        _nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
    struct nf_bridge_info   *nf_bridge;
#endif
    unsigned int        len,
                data_len;
    __u16           mac_len,
                hdr_len;

    /* Following fields are _not_ copied in __copy_skb_header()
     * Note that queue_mapping is here mostly to fill a hole.
     */
    __u16           queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK (1 << 7)
#else
#define CLONED_MASK 1
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)

    __u8            __cloned_offset[0];
    __u8            cloned:1,
                nohdr:1,
                fclone:2,
                peeked:1,
                head_frag:1,
                xmit_more:1,
                __unused:1; /* one bit hole */

    /* fields enclosed in headers_start/headers_end are copied
     * using a single memcpy() in __copy_skb_header()
     */
    /* private: */
    __u32           headers_start[0];
    /* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)

    __u8            __pkt_type_offset[0];
    __u8            pkt_type:3;
    __u8            pfmemalloc:1;
    __u8            ignore_df:1;

    __u8            nf_trace:1;
    __u8            ip_summed:2;
    __u8            ooo_okay:1;
    __u8            l4_hash:1;
    __u8            sw_hash:1;
    __u8            wifi_acked_valid:1;
    __u8            wifi_acked:1;

    __u8            no_fcs:1;
    /* Indicates the inner headers are valid in the skbuff. */
    __u8            encapsulation:1;
    __u8            encap_hdr_csum:1;
    __u8            csum_valid:1;
    __u8            csum_complete_sw:1;
    __u8            csum_level:2;
    __u8            csum_not_inet:1;

    __u8            dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
    __u8            ndisc_nodetype:2;
#endif
    __u8            ipvs_property:1;
    __u8            inner_protocol_type:1;
    __u8            remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
    __u8            offload_fwd_mark:1;
    __u8            offload_mr_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
    __u8            tc_skip_classify:1;
    __u8            tc_at_ingress:1;
    __u8            tc_redirected:1;
    __u8            tc_from_ingress:1;
#endif

#ifdef CONFIG_NET_SCHED
    __u16           tc_index;   /* traffic control index */
#endif

    union {
        __wsum      csum;
        struct {
            __u16   csum_start;
            __u16   csum_offset;
        };
    };
    __u32           priority;
    int         skb_iif;
    __u32           hash;
    __be16          vlan_proto;
    __u16           vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
    union {
        unsigned int    napi_id;
        unsigned int    sender_cpu;
    };
#endif
#ifdef CONFIG_NETWORK_SECMARK
    __u32       secmark;
#endif

    union {
        __u32       mark;
        __u32       reserved_tailroom;
    };

    union {
        __be16      inner_protocol;
        __u8        inner_ipproto;
    };

    __u16           inner_transport_header;
    __u16           inner_network_header;
    __u16           inner_mac_header;

    __be16          protocol;
    __u16           transport_header;
    __u16           network_header;
    __u16           mac_header;

    /* private: */
    __u32           headers_end[0];
    /* public: */

    /* These elements must be at the end, see alloc_skb() for details. */
    sk_buff_data_t      tail;
    sk_buff_data_t      end;
    unsigned char       *head,
                *data;
    unsigned int        truesize;
    refcount_t      users;
};
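
  Back to the idea from the introduction of crafting packets by hand: the head/data/tail/end pointers at the bottom delimit the buffer, and the usual pattern is to allocate, reserve headroom, fill the payload with skb_put(), then step backwards over each header with skb_push(). A minimal sketch (build_skb_demo is an invented helper; the header fields and checksums are left for the caller):

#include <linux/skbuff.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/string.h>

/* Invented helper: allocate a bare skb with room for Ethernet + IP + UDP
 * headers plus payload_len bytes of data. */
static struct sk_buff *build_skb_demo(unsigned int payload_len)
{
    unsigned int header_room = ETH_HLEN + sizeof(struct iphdr) +
                               sizeof(struct udphdr);
    struct sk_buff *skb;
    unsigned char *payload;

    skb = alloc_skb(header_room + payload_len, GFP_ATOMIC);
    if (!skb)
        return NULL;

    skb_reserve(skb, header_room);          /* leave headroom; data/tail move up */
    payload = skb_put(skb, payload_len);    /* tail grows over the payload area  */
    memset(payload, 0, payload_len);

    skb_push(skb, sizeof(struct udphdr));   /* data moves back over UDP header   */
    skb_reset_transport_header(skb);

    skb_push(skb, sizeof(struct iphdr));    /* ...and back again over IP header  */
    skb_reset_network_header(skb);

    return skb;                             /* free with kfree_skb() when done   */
}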