Linux 协议栈分析 socket——笔记

时间:2022-08-31 04:18:10

转自:Linux.协议栈分析.socket

通过查看socket的帮助手册可以得到socket的定义形式为:

C

int socket(int domain, int type, int protocol);


domain的有效值如下:

 
       AF_UNIX, AF_LOCAL   Local communication              unix(7)
       AF_INET             IPv4 Internet protocols          ip(7)
       AF_INET6            IPv6 Internet protocols          ipv6(7)
       AF_IPX              IPX - Novell protocols
       AF_NETLINK          Kernel user interface device     netlink(7)
       AF_X25              ITU-T X.25 / ISO-8208 protocol   x25(7)
       AF_AX25             Amateur radio AX.25 protocol
       AF_ATMPVC           Access to raw ATM PVCs
       AF_APPLETALK        Appletalk                        ddp(7)
       AF_PACKET           Low level packet interface       packet(7)

而type的取值范围为:

 
       SOCK_STREAM     Provides sequenced, reliable, two-way, connection-based
                       byte  streams.  An out-of-band data transmission mecha‐
                       nism may be supported.
       SOCK_DGRAM      Supports datagrams (connectionless, unreliable messages
                       of a fixed maximum length).
       SOCK_SEQPACKET  Provides  a  sequenced,  reliable,  two-way connection-
                       based data transmission path  for  datagrams  of  fixed
                       maximum  length;  a  consumer  is  required  to read an
                       entire packet with each input system call.
       SOCK_RAW        Provides raw network protocol access.
       SOCK_RDM        Provides a reliable datagram layer that does not  guar‐
                       antee ordering.
       SOCK_PACKET     Obsolete  and  should  not be used in new programs; see
                       packet(7).

而在内核版本2.6.27之后,还可以通过设定相应二进制位为1来设定socket的类型。即type可以在取上述值后再按位OR以下值。这一点可以在socket进入内核的源代码中得到证实。

 
       SOCK_NONBLOCK   Set  the  O_NONBLOCK  file  status flag on the new open
                       file description.  Using this flag saves extra calls to
                       fcntl(2) to achieve the same result.
       SOCK_CLOEXEC    Set the close-on-exec (FD_CLOEXEC) flag on the new file
                       descriptor.  See the description of the O_CLOEXEC  flag
                       in open(2) for reasons why this may be useful.

protocol一般为0。
socket函数经过前述的方式进入内核后会最终由sys_socket(net/socket.c)来完成。

C

1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{ /* socket(2) entry: split the flag bits out of type, create the socket, map it to a file descriptor. */
  int retval;
  struct socket *sock;
  int flags;
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
  /* Bits of type outside SOCK_TYPE_MASK are flags; anything other than CLOEXEC/NONBLOCK is rejected. */
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    return -EINVAL;
  type &= SOCK_TYPE_MASK; /* from here on type holds only the socket type proper */
  /* Where SOCK_NONBLOCK and O_NONBLOCK differ in value, translate the request to the O_* bit. */
  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
  /* Build the struct socket; if this fails there is nothing to release yet. */
  retval = sock_create(family, type, protocol, &sock);
  if (retval < 0)
    goto out;
  /* Attach the socket to a file; on success sock_map_fd()'s result (presumably the new fd) is returned via out. */
  retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
  if (retval < 0)
    goto out_release;
 
out:
  /* It may be already another descriptor 8) Not kernel problem. */
  return retval;
 
out_release: /* mapping failed: drop the socket created above and propagate the error */
  sock_release(sock);
  return retval;
}

1278~1281行就是取得type的值并检查是否合法。
我们知道socket对于用户而言就是一个已经打开的特殊文件,而内核则为插口(socket)定义了一种特殊的文件类型形成特殊的文件系统sockfs(net/socket.c),而sys_socket中调用的两个函数sock_create和sock_map_fd,可以看到这两个函数都共用一个sock参数,这便是为内核管理socket用的,而sock_map_fd明显是为用户提供已经打开的文件号。
sockfs的建立过程省略,sockfs的定义如下:

C

301
302
303
304
305
306
307
/* Mount whose superblock (sock_mnt->mnt_sb) sock_alloc() passes to new_inode(). */
static struct vfsmount *sock_mnt __read_mostly;
 
/* Filesystem type descriptor for the pseudo filesystem named "sockfs". */
static struct file_system_type sock_fs_type = {
  .name =    "sockfs",
  .get_sb =  sockfs_get_sb,
  .kill_sb =  kill_anon_super,
};

而所谓的通过socket函数创建一个插口,就是在sockfs中创建一个特殊文件,或者说是一个结点,并为实现相应插口功能建立起一整套数据结构。所以首先就通过sock_create创建一个struct socket数据结构,然后通过sock_map_fd映射到一个已经打开的文件上。在分析sock_create和sock_map_fd之前先看看struct socket的定义(include/linux/net.h):

C

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @fasync_list: Asynchronous wake up list
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wait: wait queue for several uses
 */
struct socket {
  socket_state    state;          /* starts as SS_UNCONNECTED (see sock_alloc_inode) */
 
  kmemcheck_bitfield_begin(type);
  short      type;                /* filled in by __sock_create() from the masked type argument */
  kmemcheck_bitfield_end(type);
 
  unsigned long    flags;
  /*
   * Please keep fasync_list & wait fields in the same cache line
   */
  struct fasync_struct  *fasync_list;
  wait_queue_head_t  wait;
 
  struct file    *file;           /* NULL right after allocation */
  struct sock    *sk;             /* NULL right after allocation; inet_create() passes it to sock_init_data() */
  const struct proto_ops  *ops;   /* NULL right after allocation; the family's create() installs it */
};

C

155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/*
 * struct proto_ops - per-socket table of protocol-specific operations.
 *
 * A struct socket points at one of these through its ->ops member.  For
 * PF_INET the instances quoted in this article are inet_stream_ops,
 * inet_dgram_ops and inet_sockraw_ops.  Every callback takes the
 * struct socket it operates on; @owner is the module whose reference
 * count __sock_create() bumps via try_module_get(sock->ops->owner).
 */
struct proto_ops {
  int    family;
  struct module  *owner;
  int    (*release)   (struct socket *sock);
  int    (*bind)       (struct socket *sock,
              struct sockaddr *myaddr,
              int sockaddr_len);
  int    (*connect)   (struct socket *sock,
              struct sockaddr *vaddr,
              int sockaddr_len, int flags);
  int    (*socketpair)(struct socket *sock1,
              struct socket *sock2);
  int    (*accept)    (struct socket *sock,
              struct socket *newsock, int flags);
  int    (*getname)   (struct socket *sock,
              struct sockaddr *addr,
              int *sockaddr_len, int peer);
  unsigned int  (*poll)       (struct file *file, struct socket *sock,
              struct poll_table_struct *wait);
  int    (*ioctl)     (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int    (*listen)    (struct socket *sock, int len);
  int    (*shutdown)  (struct socket *sock, int flags);
  int    (*setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  /* compat_* variants: the inet tables only fill these in under CONFIG_COMPAT */
  int    (*compat_setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*compat_getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*sendmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len);
  int    (*recvmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len,
              int flags);
  int    (*mmap)       (struct file *file, struct socket *sock,
              struct vm_area_struct * vma);
  ssize_t    (*sendpage)  (struct socket *sock, struct page *page,
              int offset, size_t size, int flags);
  ssize_t   (*splice_read)(struct socket *sock,  loff_t *ppos,
               struct pipe_inode_info *pipe, size_t len, unsigned int flags);
};

接下来分析sock_create(net/socket.c),sock_create会调用__sock_create。

C

1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
static int __sock_create(struct net *net, int family, int type, int protocol,
       struct socket **res, int kern)
{ /* Allocate a struct socket and let the family registered for @family set it up; 0 and *res on success, -errno otherwise. */
  int err;
  struct socket *sock;
  const struct net_proto_family *pf;
 
  /*
   *      Check protocol is in range
   */
  if (family < 0 || family >= NPROTO)
    return -EAFNOSUPPORT;
  if (type < 0 || type >= SOCK_MAX)
    return -EINVAL;
 
  /* Compatibility.
 
     This uglymoron is moved from INET layer to here to avoid
     deadlock in module load.
   */
  if (family == PF_INET && type == SOCK_PACKET) {
    static int warned; /* makes the message below print only once */
    if (!warned) {
      warned = 1;
      printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
             current->comm);
    }
    family = PF_PACKET; /* the obsolete combination is redirected to PF_PACKET */
  }
  /* Security hook runs before anything is allocated, so a refusal needs no cleanup. */
  err = security_socket_create(family, type, protocol, kern);
  if (err)
    return err;
 
  /*
   *  Allocate the socket and allow the family to set things up. if
   *  the protocol is 0, the family is instructed to select an appropriate
   *  default.
   */
  sock = sock_alloc();
  if (!sock) {
    if (net_ratelimit())
      printk(KERN_WARNING "socket: no more sockets\n");
    return -ENFILE;  /* Not exactly a match, but its the
           closest posix thing */
  }
 
  sock->type = type;
 
#ifdef CONFIG_MODULES
  /* Attempt to load a protocol module if the find failed.
   *
   * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
   * requested real, full-featured networking support upon configuration.
   * Otherwise module support will break!
   */
  if (net_families[family] == NULL)
    request_module("net-pf-%d", family);
#endif
  /* Look the family up under RCU; err is preset for both failure exits below. */
  rcu_read_lock();
  pf = rcu_dereference(net_families[family]);
  err = -EAFNOSUPPORT;
  if (!pf)
    goto out_release;
 
  /*
   * We will call the ->create function, that possibly is in a loadable
   * module, so we have to bump that loadable module refcnt first.
   */
  if (!try_module_get(pf->owner))
    goto out_release;
 
  /* Now protected by module ref count */
  rcu_read_unlock();
  /* Family-specific constructor; for PF_INET this is inet_create(). */
  err = pf->create(net, sock, protocol);
  if (err < 0)
    goto out_module_put;
 
  /*
   * Now to bump the refcnt of the [loadable] module that owns this
   * socket at sock_release time we decrement its refcnt.
   */
  if (!try_module_get(sock->ops->owner))
    goto out_module_busy;
 
  /*
   * Now that we're done with the ->create function, the [loadable]
   * module can have its refcnt decremented
   */
  module_put(pf->owner);
  err = security_socket_post_create(sock, family, type, protocol, kern);
  if (err)
    goto out_sock_release;
  *res = sock;
 
  return 0;
  /* Error unwinding: each label falls through to the one below it. */
out_module_busy:
  err = -EAFNOSUPPORT;
out_module_put:
  sock->ops = NULL; /* NOTE(review): presumably keeps sock_release() from calling into the family's ops -- confirm */
  module_put(pf->owner);
out_sock_release:
  sock_release(sock);
  return err;
 
out_release: /* reached with the RCU read lock still held */
  rcu_read_unlock();
  goto out_sock_release;
}
 
/*
 * sock_create - create a socket in the calling task's network namespace.
 *
 * Thin wrapper: forwards to __sock_create() with the namespace taken from
 * current->nsproxy and kern = 0, and returns its result unchanged.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
  struct net *net = current->nsproxy->net_ns;

  return __sock_create(net, family, type, protocol, res, 0);
}

1150~1171行做的很简单,不过是参数检查。
接下来的security_socket_create以及后面的security_socket_post_create都是在/include/linux/security.h中定义的空函数

C

/*
 * Stub security hooks as quoted from include/linux/security.h.
 * Both ignore every argument and report success (0), so the calls in
 * __sock_create() never fail through them.
 */
static inline int security_socket_create(int family, int type, int protocol, int kern)
{
  return 0;
}

static inline int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern)
{
  return 0;
}

1182行的sock_alloc的代码如下:

C

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
static struct socket *sock_alloc(void)
{ /* Get a fresh inode from the sock_mnt superblock and return the struct socket paired with it; NULL on failure. */
  struct inode *inode;
  struct socket *sock;
 
  inode = new_inode(sock_mnt->mnt_sb);
  if (!inode)
    return NULL;
 
  sock = SOCKET_I(inode); /* inode and socket are members of the same struct socket_alloc */
 
  kmemcheck_annotate_bitfield(sock, type);
  inode->i_mode = S_IFSOCK | S_IRWXUGO; /* mark the inode as a socket */
  inode->i_uid = current_fsuid();        /* owned by the creating task's fsuid/fsgid */
  inode->i_gid = current_fsgid();
 
  percpu_add(sockets_in_use, 1); /* bump the sockets_in_use per-CPU counter */
  return sock;
}

其中的new_inode是在/fs/inode.c中定义

C

212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
/*
 * alloc_inode - allocate and initialise an inode for superblock @sb.
 *
 * The superblock's own alloc_inode hook is used when it is set; otherwise
 * the inode comes from the generic inode_cachep slab.  Returns NULL when
 * the allocation fails or when inode_init_always() reports an error; in
 * the latter case the inode is handed back through destroy_inode (if the
 * superblock has one) or kmem_cache_free().
 */
static struct inode *alloc_inode(struct super_block *sb)
{
  struct inode *inode = sb->s_op->alloc_inode
    ? sb->s_op->alloc_inode(sb)
    : kmem_cache_alloc(inode_cachep, GFP_KERNEL);

  if (inode == NULL)
    return NULL;

  if (unlikely(inode_init_always(sb, inode))) {
    /* Initialisation failed: give the inode back before reporting failure. */
    if (!inode->i_sb->s_op->destroy_inode)
      kmem_cache_free(inode_cachep, inode);
    else
      inode->i_sb->s_op->destroy_inode(inode);
    inode = NULL;
  }

  return inode;
}

C

659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
/*
 * new_inode - allocate an inode on @sb, add it to the inode lists and give
 * it the next inode number.  Returns NULL if alloc_inode() fails.
 */
struct inode *new_inode(struct super_block *sb)
{
  /*
   * Deliberately a 32-bit counter: a non-LFS stat() on 32-bit makes glibc
   * fail with EOVERFLOW when st_ino does not fit the target field, and a
   * narrow counter is an attempt to avoid that.
   */
  static unsigned int last_ino;
  struct inode *inode;

  spin_lock_prefetch(&inode_lock);

  inode = alloc_inode(sb);
  if (!inode)
    return NULL;

  /* List insertion and numbering happen together under inode_lock. */
  spin_lock(&inode_lock);
  __inode_add_to_lists(sb, NULL, inode);
  last_ino += 1;
  inode->i_ino = last_ino;
  inode->i_state = 0;
  spin_unlock(&inode_lock);

  return inode;
}
EXPORT_SYMBOL(new_inode);

可以看出new_inode会调用alloc_inode分配inode,而alloc_inode会调用sockfs在VFS中注册的相应的函数来处理,那这个函数是什么呢?先来看一看/net/socket.c

C

241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
static struct inode *sock_alloc_inode(struct super_block *sb)
{
  struct socket_alloc *ei;
 
  ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
  if (!ei)
    return NULL;
  init_waitqueue_head(&ei->socket.wait);
 
  ei->socket.fasync_list = NULL;
  ei->socket.state = SS_UNCONNECTED;
  ei->socket.flags = 0;
  ei->socket.ops = NULL;
  ei->socket.sk = NULL;
  ei->socket.file = NULL;
 
  return &ei->vfs_inode;
}

C

287
288
289
290
291
/*
 * Superblock operations for sockfs.  alloc_inode() in fs/inode.c calls
 * sb->s_op->alloc_inode when it is set, which is how sock_alloc_inode()
 * gets invoked (presumably this table is the s_op of the sockfs
 * superblock -- the setup code is not shown here).
 */
static const struct super_operations sockfs_ops = {
  .alloc_inode =  sock_alloc_inode,
  .destroy_inode =sock_destroy_inode,
  .statfs =  simple_statfs,
};

为帮助理解列出struct socket_alloc 结构体的定义。

C

794
795
796
797
798
799
800
801
802
/*
 * One allocation holding both views of a socket: the struct socket used
 * by the network code and the VFS inode.  sock_alloc_inode() returns
 * &vfs_inode; SOCKET_I() maps that inode back to &socket.
 */
struct socket_alloc {
  struct socket socket;
  struct inode vfs_inode;
};
 
static inline struct socket *SOCKET_I(struct inode *inode)
{
  return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

可以看到这个函数其实就是sock_alloc_inode,该函数分配了一个struct socket_alloc类型的结构体,然后返回这个结构体中的一个成员变量vfs_inode的地址,可以看出来这就是一个inode结构。然后就回到了sock_alloc函数的第489行,通过SOCKET_I获得与vfs_inode同在socket_alloc结构体中的成员socket的地址。然后程序返回到__sock_create的1190行。

1192开始的代码说明,如果编译内核开启了CONFIG_MODULES也就是内核模块的选项就先检查内核现在是否有支持由family(就是domain)所指定的网域的代码,如果没有则通过request_module来安装。

说到这里就先看看1204行的net_families这个数组,很明显它是控制和操作各个网域的一个控制结构体的集合,通过变量pf可以发现它的类型为struct net_proto_family(/include/linux/net.h)

C

201
202
203
204
205
/*
 * Per-address-family descriptor.  __sock_create() reads one from
 * net_families[family], takes a reference on @owner and then calls
 * @create to let the family initialise the new socket.
 */
struct net_proto_family {
  int    family;
  int    (*create)(struct net *net, struct socket *sock, int protocol);
  struct module  *owner;
};

然后1219行通过pf调用相应网域的create的函数,可以很简单地得出对于AF_UNIX, AF_INET, AF_INET6, AF_PACKET这些所对应的create函数肯定不一样。接下来我们以AF_INET为例说明。在/net/ipv4/af_inet.c中

C

934
935
936
937
938
static struct net_proto_family inet_family_ops = { /* PF_INET family descriptor */
  .family = PF_INET,
  .create = inet_create,   /* invoked as pf->create() by __sock_create() */
  .owner  = THIS_MODULE,
};

由936可以得出对于AF_INET其create函数为inet_create,定义于同一文件中。

C

265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
static int inet_create(struct net *net, struct socket *sock, int protocol)
{ /* PF_INET ->create: pick the inetsw entry for (sock->type, protocol), install its ops, allocate and set up the struct sock. */
  struct sock *sk;
  struct inet_protosw *answer;
  struct inet_sock *inet;
  struct proto *answer_prot;
  unsigned char answer_flags;
  char answer_no_check;
  int try_loading_module = 0;
  int err;
  /* While inet_ehash_secret is still zero, build it -- but not for raw or datagram sockets. */
  if (unlikely(!inet_ehash_secret))
    if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
      build_ehash_secret();
 
  sock->state = SS_UNCONNECTED;
 
  /* Look for the requested type/protocol pair. */
lookup_protocol:
  err = -ESOCKTNOSUPPORT; /* stays if the list for this type is empty */
  rcu_read_lock();
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
 
    err = 0;
    /* Check the non-wild match. */
    if (protocol == answer->protocol) {
      if (protocol != IPPROTO_IP)
        break;
    } else {
      /* Check for the two wild cases. */
      if (IPPROTO_IP == protocol) {
        protocol = answer->protocol;
        break;
      }
      if (IPPROTO_IP == answer->protocol)
        break;
    }
    err = -EPROTONOSUPPORT; /* this entry did not match; err is reset to 0 at the top of the next pass */
  }
  /* err == 0 means the loop broke out on a match; otherwise try loading a module (at most twice) and look again. */
  if (unlikely(err)) {
    if (try_loading_module < 2) {
      rcu_read_unlock();
      /*
       * Be more specific, e.g. net-pf-2-proto-132-type-1
       * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
       */
      if (++try_loading_module == 1)
        request_module("net-pf-%d-proto-%d-type-%d",
                 PF_INET, protocol, sock->type);
      /*
       * Fall back to generic, e.g. net-pf-2-proto-132
       * (net-pf-PF_INET-proto-IPPROTO_SCTP)
       */
      else
        request_module("net-pf-%d-proto-%d",
                 PF_INET, protocol);
      goto lookup_protocol;
    } else
      goto out_rcu_unlock;
  }
  /* Entries with capability > 0 require the caller to hold that capability. */
  err = -EPERM;
  if (answer->capability > 0 && !capable(answer->capability))
    goto out_rcu_unlock;
 
  err = -EAFNOSUPPORT;
  if (!inet_netns_ok(net, protocol))
    goto out_rcu_unlock;
  /* Copy everything needed out of the matched entry before leaving the RCU read section. */
  sock->ops = answer->ops;
  answer_prot = answer->prot;
  answer_no_check = answer->no_check;
  answer_flags = answer->flags;
  rcu_read_unlock();
 
  WARN_ON(answer_prot->slab == NULL);
  /* Allocate the struct sock for this socket from the chosen proto. */
  err = -ENOBUFS;
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
  if (sk == NULL)
    goto out;
 
  err = 0;
  sk->sk_no_check = answer_no_check;
  if (INET_PROTOSW_REUSE & answer_flags)
    sk->sk_reuse = 1;
 
  inet = inet_sk(sk);
  inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
  /* Raw sockets record the protocol number in inet->num; IPPROTO_RAW also turns on hdrincl. */
  if (SOCK_RAW == sock->type) {
    inet->num = protocol;
    if (IPPROTO_RAW == protocol)
      inet->hdrincl = 1;
  }
  /* Path-MTU discovery mode follows the ipv4_config.no_pmtu_disc setting. */
  if (ipv4_config.no_pmtu_disc)
    inet->pmtudisc = IP_PMTUDISC_DONT;
  else
    inet->pmtudisc = IP_PMTUDISC_WANT;
 
  inet->id = 0;
 
  sock_init_data(sock, sk);
 
  sk->sk_destruct     = inet_sock_destruct;
  sk->sk_protocol     = protocol;
  sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
  /* Initial unicast/multicast settings. */
  inet->uc_ttl  = -1;
  inet->mc_loop  = 1;
  inet->mc_ttl  = 1;
  inet->mc_all  = 1;
  inet->mc_index  = 0;
  inet->mc_list  = NULL;
 
  sk_refcnt_debug_inc(sk);
 
  if (inet->num) {
    /* It assumes that any protocol which allows
     * the user to assign a number at socket
     * creation time automatically
     * shares.
     */
    inet->sport = htons(inet->num);
    /* Add to protocol hash chains. */
    sk->sk_prot->hash(sk);
  }
  /* Optional per-protocol init hook; if it fails the sock is released and its error returned. */
  if (sk->sk_prot->init) {
    err = sk->sk_prot->init(sk);
    if (err)
      sk_common_release(sk);
  }
out:
  return err;
out_rcu_unlock: /* error exits taken while the RCU read lock is still held */
  rcu_read_unlock();
  goto out;
}

第283到325行就是通过type和protocol从inetsw中找出对应的struct inet_protosw的结构体。inetsw是在net/ipv4/af_inet.c中定义的

C

120
121
122
123
124
/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 *
 * One list head per socket type (indexed by the type value, below
 * SOCK_MAX); inet_create() walks inetsw[sock->type] under RCU.
 */
static struct list_head inetsw[SOCK_MAX];
/* Taken with spin_lock_bh() by inet_register_protosw() while it edits the lists. */
static DEFINE_SPINLOCK(inetsw_lock);

而对于struct inet_protosw是在/include/net/protocol.h中定义

C

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
  struct list_head list;     /* links the entry into inetsw[type] */
 
        /* These two fields form the lookup key.  */
  unsigned short   type;     /* This is the 2nd argument to socket(2). */
  unsigned short   protocol; /* This is the L4 protocol number.  */
 
  struct proto   *prot;      /* passed to sk_alloc() by inet_create() */
  const struct proto_ops *ops; /* becomes sock->ops in inet_create() */
 
  int              capability; /* Which (if any) capability do
              * we need to use this socket
              * interface?
                                      */
  char             no_check;   /* checksum on rcv/xmit/none? */
  unsigned char   flags;      /* See INET_PROTOSW_* below.  */
};

inetsw其实就是Linux内核的典型的组织链表结构的一个数组,是按type组织的。inetsw是通过inet_register_protosw初始化的

C

980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
/*
 * inet_register_protosw - add protocol switch entry @p to inetsw[p->type].
 *
 * Runs entirely under inetsw_lock.  Registration is refused, with a
 * KERN_ERR message, when p->type is out of range or when a permanent
 * entry for the same protocol number already exists on that list.
 * Nothing is returned; failures are only logged.
 */
void inet_register_protosw(struct inet_protosw *p)
{
  struct list_head *lh;
  struct inet_protosw *answer;
  int protocol = p->protocol;
  struct list_head *last_perm;
 
  spin_lock_bh(&inetsw_lock);
 
  if (p->type >= SOCK_MAX)
    goto out_illegal;
 
  /* If we are trying to override a permanent protocol, bail. */
  answer = NULL;
  last_perm = &inetsw[p->type]; /* insertion point defaults to the list head */
  list_for_each(lh, &inetsw[p->type]) {
    answer = list_entry(lh, struct inet_protosw, list);
 
    /* Check only the non-wild match. */
    if (INET_PROTOSW_PERMANENT & answer->flags) {
      if (protocol == answer->protocol)
        break; /* leaves answer non-NULL: conflict with a permanent entry */
      last_perm = lh; /* remember the last permanent entry seen so far */
    }
 
    answer = NULL;
  }
  if (answer)
    goto out_permanent;
 
  /* Add the new entry after the last permanent entry if any, so that
   * the new entry does not override a permanent entry when matched with
   * a wild-card protocol. But it is allowed to override any existing
   * non-permanent entry.  This means that when we remove this entry, the
   * system automatically returns to the old behavior.
   */
  list_add_rcu(&p->list, last_perm);
out:
  spin_unlock_bh(&inetsw_lock);
 
  return;
 
out_permanent:
  printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
         protocol);
  goto out;
 
out_illegal:
  printk(KERN_ERR
         "Ignoring attempt to register invalid socket type %d.\n",
         p->type);
  goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

对于inet_register_protosw的调用是在inet_init中的第1593行进行的。

C

1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
static int __init inet_init(void)
{ /* IPv4 stack bring-up, registered through fs_initcall(); returns 0 or the proto_register() error. */
  struct sk_buff *dummy_skb;
  struct inet_protosw *q;
  struct list_head *r;
  int rc = -EINVAL;
  /* Build-time check: struct inet_skb_parm must fit in sk_buff's cb field (dummy_skb is only used for sizeof). */
  BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
  /* Register the three base protos; a later failure unregisters the earlier ones via the labels at the end. */
  rc = proto_register(&tcp_prot, 1);
  if (rc)
    goto out;
 
  rc = proto_register(&udp_prot, 1);
  if (rc)
    goto out_unregister_tcp_proto;
 
  rc = proto_register(&raw_prot, 1);
  if (rc)
    goto out_unregister_udp_proto;
 
  /*
   *  Tell SOCKET that we are alive...
   */
 
  (void)sock_register(&inet_family_ops); /* return value deliberately ignored */
 
#ifdef CONFIG_SYSCTL
  ip_static_sysctl_init();
#endif
 
  /*
   *  Add all the base protocols.
   */
 
  if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
  if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
  if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
#ifdef CONFIG_IP_MULTICAST
  if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
#endif
 
  /* Register the socket-side information for inet_create. */
  for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
    INIT_LIST_HEAD(r);
  /* Feed every inetsw_array entry into the per-type inetsw lists. */
  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
    inet_register_protosw(q);
 
  /*
   *  Set the ARP module up
   */
 
  arp_init();
 
  /*
   *  Set the IP module up
   */
 
  ip_init();
 
  tcp_v4_init();
 
  /* Setup TCP slab cache for open requests. */
  tcp_init();
 
  /* Setup UDP memory threshold */
  udp_init();
 
  /* Add UDP-Lite (RFC 3828) */
  udplite4_register();
 
  /*
   *  Set the ICMP layer up
   */
 
  if (icmp_init() < 0)
    panic("Failed to create the ICMP control socket.\n");
 
  /*
   *  Initialise the multicast router
   */
#if defined(CONFIG_IP_MROUTE)
  if (ip_mr_init())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
#endif
  /*
   *  Initialise per-cpu ipv4 mibs
   */
 
  if (init_ipv4_mibs())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
 
  ipv4_proc_init();
 
  ipfrag_init();
 
  dev_add_pack(&ip_packet_type);
 
  rc = 0;
out:
  return rc;
out_unregister_udp_proto: /* unwinding falls through: udp first, then tcp */
  proto_unregister(&udp_prot);
out_unregister_tcp_proto:
  proto_unregister(&tcp_prot);
  goto out;
}
 
fs_initcall(inet_init);

从1592行可以看出初始化inetsw是用的inetsw_array数组,再看看inetsw_array数组。

C

852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
/*
 * Socket operations for PF_INET stream sockets.  inetsw_array pairs this
 * table with SOCK_STREAM / IPPROTO_TCP, and inet_create() installs it as
 * sock->ops for such sockets.
 */
const struct proto_ops inet_stream_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_stream_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = inet_accept,
  .getname     = inet_getname,
  .poll       = tcp_poll,
  .ioctl       = inet_ioctl,
  .listen       = inet_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = tcp_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = tcp_sendpage,
  .splice_read     = tcp_splice_read,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
 
/*
 * Socket operations for PF_INET datagram sockets.  inetsw_array pairs
 * this table with SOCK_DGRAM / IPPROTO_UDP.  Unlike inet_stream_ops,
 * accept and listen point at the sock_no_* handlers and no splice_read
 * is provided.
 */
const struct proto_ops inet_dgram_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = udp_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
 
/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll
 *
 * The only entry that differs from inet_dgram_ops is .poll, which is
 * datagram_poll here.  The table is file-local (static, not exported)
 * and is referenced from the SOCK_RAW entry of inetsw_array.
 */
static const struct proto_ops inet_sockraw_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = datagram_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
 
/*
 * PF_INET family descriptor handed to sock_register() in inet_init();
 * its create hook, inet_create, is what __sock_create() calls for
 * PF_INET sockets.
 */
static struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};
 
/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each entry ties a (type, protocol) pair to the struct proto and the
 * proto_ops table inet_create() will use.  inet_create() only checks
 * capability when it is > 0, so -1 means no capability is required.
 */
static struct inet_protosw inetsw_array[] =
{
  {
    .type =       SOCK_STREAM,
    .protocol =   IPPROTO_TCP,
    .prot =       &tcp_prot,
    .ops =        &inet_stream_ops,
    .capability = -1,
    .no_check =   0,
    .flags =      INET_PROTOSW_PERMANENT |
            INET_PROTOSW_ICSK,
  },
 
  {
    .type =       SOCK_DGRAM,
    .protocol =   IPPROTO_UDP,
    .prot =       &udp_prot,
    .ops =        &inet_dgram_ops,
    .capability = -1,
    .no_check =   UDP_CSUM_DEFAULT,
    .flags =      INET_PROTOSW_PERMANENT,
       },
 
 
       {
         .type =       SOCK_RAW,
         .protocol =   IPPROTO_IP,  /* wild card */
         .prot =       &raw_prot,
         .ops =        &inet_sockraw_ops,
         .capability = CAP_NET_RAW, /* checked with capable() in inet_create() */
         .no_check =   UDP_CSUM_DEFAULT,
         .flags =      INET_PROTOSW_REUSE, /* makes inet_create() set sk->sk_reuse = 1 */
       }
};
 
/* Number of entries above; bounds the registration loop in inet_init(). */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

假设我们分析ipv4中的TCP协议,其它协议也可以参照分析。现在回到inet_create函数,这个函数最重要的一行就是335,这一行的作用就是初始化套接口socket所应该对应的操作函数。例如如果用socket(AF_INET, SOCK_STREAM, 0);创建套接字,则内核就会在这里为这个套接字关联上相应的TCP的操作函数集inet_stream_ops,以后在这个套接字上的数据的各种操作如accept listen bind send recv都会通过这些函数完成。
接下来在inet_create中的344后就是分配一个struct sock结构体,这个sock结构和socket结构是一一对应的,两个结构各有一个成员指向对方。struct sock是在include/net/sock.h中定义,它有两个非常重要的成员sk_receive_queue和sk_write_queue。还有两个成员sk_rcvbuf,sk_sndbuf分别代表接收和发送缓冲区的大小,默认是32767字节,是在sock_init_data(net/core/sock.c)中初始化的。另外对于有连接模式可能要求超时重传,所以还有一个sk_timer的定时队列。

C

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
/**
  *  struct sock - network layer representation of sockets
  *  @__sk_common: shared layout with inet_timewait_sock
  *  @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *  @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *  @sk_lock:  synchronizer
  *  @sk_rcvbuf: size of receive buffer in bytes
  *  @sk_sleep: sock wait queue
  *  @sk_dst_cache: destination cache
  *  @sk_dst_lock: destination cache lock
  *  @sk_policy: flow policy
  *  @sk_rmem_alloc: receive queue bytes committed
  *  @sk_receive_queue: incoming packets
  *  @sk_wmem_alloc: transmit queue bytes committed
  *  @sk_write_queue: Packet sending queue
  *  @sk_async_wait_queue: DMA copied packets
  *  @sk_omem_alloc: "o" is "option" or "other"
  *  @sk_wmem_queued: persistent queue size
  *  @sk_forward_alloc: space allocated forward
  *  @sk_allocation: allocation mode
  *  @sk_sndbuf: size of send buffer in bytes
  *  @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *       %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *  @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
  *  @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *  @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *  @sk_gso_max_size: Maximum GSO segment size to build
  *  @sk_lingertime: %SO_LINGER l_linger setting
  *  @sk_backlog: always used with the per-socket spinlock held
  *  @sk_callback_lock: used with the callbacks in the end of this struct
  *  @sk_error_queue: rarely used
  *  @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *        IPV6_ADDRFORM for instance)
  *  @sk_err: last error
  *  @sk_err_soft: errors that don't cause failure but are the cause of a
  *          persistent failure not just 'timed out'
  *  @sk_drops: raw/udp drops counter
  *  @sk_ack_backlog: current listen backlog
  *  @sk_max_ack_backlog: listen backlog set in listen()
  *  @sk_priority: %SO_PRIORITY setting
  *  @sk_type: socket type (%SOCK_STREAM, etc)
  *  @sk_protocol: which protocol this socket belongs in this network family
  *  @sk_peercred: %SO_PEERCRED setting
  *  @sk_rcvlowat: %SO_RCVLOWAT setting
  *  @sk_rcvtimeo: %SO_RCVTIMEO setting
  *  @sk_sndtimeo: %SO_SNDTIMEO setting
  *  @sk_filter: socket filtering instructions
  *  @sk_protinfo: private area, net family specific, when not using slab
  *  @sk_timer: sock cleanup timer
  *  @sk_stamp: time stamp of last packet received
  *  @sk_socket: Identd and reporting IO signals
  *  @sk_user_data: RPC layer private data
  *  @sk_sndmsg_page: cached page for sendmsg
  *  @sk_sndmsg_off: cached offset for sendmsg
  *  @sk_send_head: front of stuff to transmit
  *  @sk_security: used by security modules
  *  @sk_mark: generic packet mark
  *  @sk_write_pending: a write to stream socket waits to start
  *  @sk_state_change: callback to indicate change in the state of the sock
  *  @sk_data_ready: callback to indicate there is data to be processed
  *  @sk_write_space: callback to indicate there is bf sending space available
  *  @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *  @sk_backlog_rcv: callback to process the backlog
  *  @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 */
struct sock {
  /*
   * Now struct inet_timewait_sock also uses sock_common, so please just
   * don't add anything before this first member (__sk_common) --acme
   */
  struct sock_common  __sk_common;  /* sk_* macros below alias its skc_* fields */
#define sk_node      __sk_common.skc_node
#define sk_nulls_node    __sk_common.skc_nulls_node
#define sk_refcnt    __sk_common.skc_refcnt
 
#define sk_copy_start    __sk_common.skc_hash
#define sk_hash      __sk_common.skc_hash
#define sk_family    __sk_common.skc_family
#define sk_state    __sk_common.skc_state
#define sk_reuse    __sk_common.skc_reuse
#define sk_bound_dev_if    __sk_common.skc_bound_dev_if
#define sk_bind_node    __sk_common.skc_bind_node
#define sk_prot      __sk_common.skc_prot
#define sk_net      __sk_common.skc_net
  kmemcheck_bitfield_begin(flags);
  unsigned int    sk_shutdown  : 2,
        sk_no_check  : 2,
        sk_userlocks : 4,
        sk_protocol  : 8,  /* protocol within this network family */
        sk_type      : 16;  /* SOCK_STREAM etc.; 2+2+4+8+16 = 32 bits */
  kmemcheck_bitfield_end(flags);
  int      sk_rcvbuf;
  socket_lock_t    sk_lock;
  /*
   * The backlog queue is special, it is always used with
   * the per-socket spinlock held and requires low latency
   * access. Therefore we special case its implementation.
   */
  struct {
    struct sk_buff *head;
    struct sk_buff *tail;
  } sk_backlog;
  wait_queue_head_t  *sk_sleep;
  struct dst_entry  *sk_dst_cache;
#ifdef CONFIG_XFRM
  struct xfrm_policy  *sk_policy[2];
#endif
  rwlock_t    sk_dst_lock;
  atomic_t    sk_rmem_alloc;
  atomic_t    sk_wmem_alloc;  /* set to 1 by sk_alloc() */
  atomic_t    sk_omem_alloc;
  int      sk_sndbuf;
  struct sk_buff_head  sk_receive_queue;
  struct sk_buff_head  sk_write_queue;
#ifdef CONFIG_NET_DMA
  struct sk_buff_head  sk_async_wait_queue;
#endif
  int      sk_wmem_queued;
  int      sk_forward_alloc;
  gfp_t      sk_allocation;
  int      sk_route_caps;
  int      sk_gso_type;
  unsigned int    sk_gso_max_size;  /* maximum GSO segment size to build */
  int      sk_rcvlowat;  /* SO_RCVLOWAT setting */
  unsigned long     sk_flags;
  unsigned long          sk_lingertime;  /* SO_LINGER l_linger setting */
  struct sk_buff_head  sk_error_queue;  /* rarely used */
  struct proto    *sk_prot_creator;  /* sk_prot of the original sock creator */
  rwlock_t    sk_callback_lock;  /* used with the callbacks at the end of this struct */
  int      sk_err,  /* last error */
        sk_err_soft;  /* errors behind a persistent failure, not just 'timed out' */
  atomic_t    sk_drops;  /* raw/udp drops counter */
  unsigned short    sk_ack_backlog;  /* current listen backlog */
  unsigned short    sk_max_ack_backlog;  /* listen backlog set in listen() */
  __u32      sk_priority;  /* SO_PRIORITY setting */
  struct ucred    sk_peercred;  /* SO_PEERCRED setting */
  long      sk_rcvtimeo;  /* SO_RCVTIMEO setting */
  long      sk_sndtimeo;  /* SO_SNDTIMEO setting */
  struct sk_filter        *sk_filter;  /* socket filtering instructions */
  void      *sk_protinfo;  /* private, net-family specific area when not using slab */
  struct timer_list  sk_timer;  /* sock cleanup timer */
  ktime_t      sk_stamp;  /* time stamp of last packet received */
  struct socket    *sk_socket;  /* Identd and reporting IO signals */
  void      *sk_user_data;  /* RPC layer private data */
  struct page    *sk_sndmsg_page;  /* cached page for sendmsg */
  struct sk_buff    *sk_send_head;  /* front of stuff to transmit */
  __u32      sk_sndmsg_off;  /* cached offset for sendmsg */
  int      sk_write_pending;  /* a write to a stream socket waits to start */
#ifdef CONFIG_SECURITY
  void      *sk_security;  /* used by security modules */
#endif
  __u32      sk_mark;  /* generic packet mark */
  /* XXX 4 bytes hole on 64 bit */
  void      (*sk_state_change)(struct sock *sk);  /* state of the sock changed */
  void      (*sk_data_ready)(struct sock *sk, int bytes);  /* data is ready to be processed */
  void      (*sk_write_space)(struct sock *sk);  /* sending space became available */
  void      (*sk_error_report)(struct sock *sk);  /* report errors (e.g. MSG_ERRQUEUE) */
    int      (*sk_backlog_rcv)(struct sock *sk,
              struct sk_buff *skb);  /* process the backlog */
  void                    (*sk_destruct)(struct sock *sk);  /* called at sock freeing time */
};

在分析sk_alloc之前先分析一下answer_prot. answer_prot是struct proto类型(include/net/sock.h)

C

606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
/* Networking protocol blocks we attach to sockets.
 * This is the socket layer -> transport layer interface; the
 * transport -> network interface is defined by struct inet_proto.
 */
struct proto {
  void      (*close)(struct sock *sk,
          long timeout);
  int      (*connect)(struct sock *sk,
                struct sockaddr *uaddr,
          int addr_len);
  int      (*disconnect)(struct sock *sk, int flags);
 
  struct sock *    (*accept) (struct sock *sk, int flags, int *err);
 
  int      (*ioctl)(struct sock *sk, int cmd,
           unsigned long arg);
  int      (*init)(struct sock *sk);
  void      (*destroy)(struct sock *sk);
  void      (*shutdown)(struct sock *sk, int how);
  int      (*setsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*getsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          int __user *option);
#ifdef CONFIG_COMPAT
  int      (*compat_setsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*compat_getsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          int __user *option);
#endif
  int      (*sendmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg, size_t len);
  int      (*recvmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg,
          size_t len, int noblock, int flags,
          int *addr_len);
  int      (*sendpage)(struct sock *sk, struct page *page,
          int offset, size_t size, int flags);
  int      (*bind)(struct sock *sk,
          struct sockaddr *uaddr, int addr_len);
 
  int      (*backlog_rcv) (struct sock *sk,
            struct sk_buff *skb);
 
  /* Keeping track of sk's, looking them up, and port selection methods. */
  void      (*hash)(struct sock *sk);
  void      (*unhash)(struct sock *sk);
  int      (*get_port)(struct sock *sk, unsigned short snum);
 
  /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
  unsigned int    inuse_idx;
#endif
 
  /* Memory pressure */
  void      (*enter_memory_pressure)(struct sock *sk);
  atomic_t    *memory_allocated;  /* Current allocated memory. */
  struct percpu_counter  *sockets_allocated;  /* Current number of sockets. */
  /*
   * Pressure flag: try to collapse.
   * Technical note: it is used by multiple contexts non atomically.
   * All the __sk_mem_schedule() is of this nature: accounting
   * is strict, actions are advisory and have some latency.
   */
  int      *memory_pressure;
  int      *sysctl_mem;
  int      *sysctl_wmem;
  int      *sysctl_rmem;
  int      max_header;
 
  struct kmem_cache  *slab;  /* sk_prot_alloc() allocates from it when non-NULL */
  unsigned int    obj_size;  /* bytes per sock object (kmalloc size / zeroing length) */
  int      slab_flags;
 
  struct percpu_counter  *orphan_count;
 
  struct request_sock_ops  *rsk_prot;
  struct timewait_sock_ops *twsk_prot;
 
  union {
    struct inet_hashinfo  *hashinfo;
    struct udp_table  *udp_table;
    struct raw_hashinfo  *raw_hash;
  } h;  /* tcp_prot sets the hashinfo member */
 
  struct module    *owner;  /* pinned with try_module_get() in sk_prot_alloc() */
 
  char      name[32];  /* e.g. "TCP" in tcp_prot */
 
  struct list_head  node;
#ifdef SOCK_REFCNT_DEBUG
  atomic_t    socks;
#endif
};

假设分析的是TCP协议,则通过336行的赋值从inetsw_array找到其prot成员变量为tcp_prot(net/ipv4/tcp_ipv4.c)。

C

2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
struct proto tcp_prot = {  /* members not named below (e.g. .slab) start out zero/NULL */
  .name      = "TCP",
  .owner      = THIS_MODULE,
  .close      = tcp_close,
  .connect    = tcp_v4_connect,
  .disconnect    = tcp_disconnect,
  .accept      = inet_csk_accept,
  .ioctl      = tcp_ioctl,
  .init      = tcp_v4_init_sock,
  .destroy    = tcp_v4_destroy_sock,
  .shutdown    = tcp_shutdown,
  .setsockopt    = tcp_setsockopt,
  .getsockopt    = tcp_getsockopt,
  .recvmsg    = tcp_recvmsg,
  .backlog_rcv    = tcp_v4_do_rcv,
  .hash      = inet_hash,
  .unhash      = inet_unhash,
  .get_port    = inet_csk_get_port,
  .enter_memory_pressure  = tcp_enter_memory_pressure,
  .sockets_allocated  = &tcp_sockets_allocated,
  .orphan_count    = &tcp_orphan_count,
  .memory_allocated  = &tcp_memory_allocated,
  .memory_pressure  = &tcp_memory_pressure,
  .sysctl_mem    = sysctl_tcp_mem,
  .sysctl_wmem    = sysctl_tcp_wmem,
  .sysctl_rmem    = sysctl_tcp_rmem,
  .max_header    = MAX_TCP_HEADER,
  .obj_size    = sizeof(struct tcp_sock),  /* every TCP sock is allocated as a full tcp_sock */
  .slab_flags    = SLAB_DESTROY_BY_RCU,  /* see the sk_node.next special case in sk_prot_alloc() */
  .twsk_prot    = &tcp_timewait_sock_ops,
  .rsk_prot    = &tcp_request_sock_ops,
  .h.hashinfo    = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
  .compat_setsockopt  = compat_tcp_setsockopt,
  .compat_getsockopt  = compat_tcp_getsockopt,
#endif
};

通过tcp_prot结构体中对各成员的赋值可以发现,slab成员并没有被初始化,而obj_size被初始化为sizeof(struct tcp_sock),这一点在后面的分析中会用到。接下来看inet_create的344行,即sk_alloc(net/core/sock.c)。

C

951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
    int family)
{
  struct sock *sk;
  struct kmem_cache *slab;
 
  slab = prot->slab;
  if (slab != NULL) {
    sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);  /* zeroing is done by hand below */
    if (!sk)
      return sk;  /* allocation failed: return NULL */
    if (priority & __GFP_ZERO) {
      /*
       * Caches using SLAB_DESTROY_BY_RCU should leave
       * sk_node.next unmodified. Special care is taken
       * when zero-initializing the object.
       */
      if (offsetof(struct sock, sk_node.next) != 0)
        memset(sk, 0, offsetof(struct sock, sk_node.next));  /* bytes before sk_node.next */
      memset(&sk->sk_node.pprev, 0,
             prot->obj_size - offsetof(struct sock,
               sk_node.pprev));  /* from sk_node.pprev to the end of the object */
    }
  }
  else
    sk = kmalloc(prot->obj_size, priority);  /* no slab cache: priority is passed unmasked */
 
  if (sk != NULL) {
    kmemcheck_annotate_bitfield(sk, flags);
 
    if (security_sk_alloc(sk, family, priority))
      goto out_free;  /* nonzero return: only the memory needs freeing */
 
    if (!try_module_get(prot->owner))
      goto out_free_sec;  /* also undo security_sk_alloc() */
  }
 
  return sk;  /* NULL here only if kmalloc() failed */
 
out_free_sec:
  security_sk_free(sk);
out_free:
  if (slab != NULL)
    kmem_cache_free(slab, sk);  /* free with the allocator that produced sk */
  else
    kfree(sk);
  return NULL;
}

C

1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
/**
 *  sk_alloc - All socket objects are allocated here
 *  @net: the applicable net namespace
 *  @family: protocol family
 *  @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *  @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
          struct proto *prot)
{
  struct sock *sk;
 
  sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);  /* always request zeroing */
  if (sk) {
    sk->sk_family = family;
    /*
     * See comment in struct sock definition to understand
     * why we need sk_prot_creator -acme
     */
    sk->sk_prot = sk->sk_prot_creator = prot;  /* both start out as the creating proto */
    sock_lock_init(sk);
    sock_net_set(sk, get_net(net));
    atomic_set(&sk->sk_wmem_alloc, 1);  /* NOTE(review): reason for the initial 1 is not shown here */
  }
 
  return sk;  /* NULL if sk_prot_alloc() failed */
}
EXPORT_SYMBOL(sk_alloc);

很明显在sk_alloc中直接调用sk_prot_alloc来分配sock结构,在sk_prot_alloc中先判定slab是否为空(如前提示),由于tcp_prot的初始化中并未设置slab,所以按kmalloc分支直接分配obj_size大小即sizeof(struct tcp_sock)的空间(注:如果该协议在注册时创建了slab缓存,则会走kmem_cache_alloc分支,但从memset的长度可以看出,对象大小同样是obj_size),并以struct sock *类型返回该空间的地址。既然返回类型是struct sock *,而空间的大小却是sizeof(struct tcp_sock),那就只有两种可能:一、sizeof(struct tcp_sock) == sizeof(struct sock);二、sizeof(struct tcp_sock) > sizeof(struct sock)。实际是第二种情况,通过列出下面一系列数据结构可以很明显地看出。
先来看struct tcp_sock结构的定义(include/linux/tcp.h)

C

247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
struct tcp_sock {
  /* inet_connection_sock has to be the first member of tcp_sock */
  struct inet_connection_sock  inet_conn;  /* itself begins with inet_sock, which begins with struct sock */
  u16  tcp_header_len;  /* Bytes of tcp header to send    */
  u16  xmit_size_goal_segs; /* Goal for segmenting output packets */
 
/*
 *  Header prediction flags
 *  0x5?10 << 16 + snd_wnd in net byte order
 */
  __be32  pred_flags;
 
/*
 *  RFC793 variables by their proper names. This means you can
 *  read the code and the spec side by side (and laugh ...)
 *  See RFC793 and RFC1122. The RFC writes these in capitals.
 */
   u32  rcv_nxt;  /* What we want to receive next   */
  u32  copied_seq;  /* Head of yet unread data    */
  u32  rcv_wup;  /* rcv_nxt on last window update sent  */
   u32  snd_nxt;  /* Next sequence we send    */
 
   u32  snd_una;  /* First byte we want an ack for  */
   u32  snd_sml;  /* Last byte of the most recently transmitted small packet */
  u32  rcv_tstamp;  /* timestamp of last received ACK (for keepalives) */
  u32  lsndtime;  /* timestamp of last sent data packet (for restart window) */
 
  /* Data for direct copy to user */
  struct {
    struct sk_buff_head  prequeue;
    struct task_struct  *task;
    struct iovec    *iov;
    int      memory;
    int      len;
#ifdef CONFIG_NET_DMA
    /* members for async copy */
    struct dma_chan    *dma_chan;
    int      wakeup;
    struct dma_pinned_list  *pinned_list;
    dma_cookie_t    dma_cookie;
#endif
  } ucopy;
 
  u32  snd_wl1;  /* Sequence for window update    */
  u32  snd_wnd;  /* The window we expect to receive  */
  u32  max_window;  /* Maximal window ever seen from peer  */
  u32  mss_cache;  /* Cached effective mss, not including SACKS */
 
  u32  window_clamp;  /* Maximal window to advertise    */
  u32  rcv_ssthresh;  /* Current window clamp      */
 
  u32  frto_highmark;  /* snd_nxt when RTO occurred */
  u16  advmss;    /* Advertised MSS      */
  u8  frto_counter;  /* Number of new acks after RTO */
  u8  nonagle;  /* Disable Nagle algorithm?             */
 
/* RTT measurement */
  u32  srtt;    /* smoothed round trip time << 3  */
  u32  mdev;    /* medium deviation      */
  u32  mdev_max;  /* maximal mdev for the last rtt period  */
  u32  rttvar;    /* smoothed mdev_max      */
  u32  rtt_seq;  /* sequence number to update rttvar  */
 
  u32  packets_out;  /* Packets which are "in flight"  */
  u32  retrans_out;  /* Retransmitted packets out    */
 
  u16  urg_data;  /* Saved octet of OOB data and control flags */
  u8  ecn_flags;  /* ECN status bits.      */
  u8  reordering;  /* Packet reordering metric.    */
  u32  snd_up;    /* Urgent pointer    */
 
  u8  keepalive_probes; /* num of allowed keep alive probes  */
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
  struct tcp_options_received rx_opt;
 
/*
 *  Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
   u32  snd_ssthresh;  /* Slow start size threshold    */
   u32  snd_cwnd;  /* Sending congestion window    */
  u32  snd_cwnd_cnt;  /* Linear increase counter    */
  u32  snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
  u32  snd_cwnd_used;
  u32  snd_cwnd_stamp;
 
   u32  rcv_wnd;  /* Current receiver window    */
  u32  write_seq;  /* Tail(+1) of data held in tcp send buffer */
  u32  pushed_seq;  /* Last pushed seq, required to talk to windows */
  u32  lost_out;  /* Lost packets      */
  u32  sacked_out;  /* SACK'd packets      */
  u32  fackets_out;  /* FACK'd packets      */
  u32  tso_deferred;
  u32  bytes_acked;  /* Appropriate Byte Counting - RFC3465 */
 
  /* from STCP, retrans queue hinting */
  struct sk_buff* lost_skb_hint;
  struct sk_buff *scoreboard_skb_hint;
  struct sk_buff *retransmit_skb_hint;
 
  struct sk_buff_head  out_of_order_queue; /* Out of order segments go here */
 
  /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */
  struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
  struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
  struct tcp_sack_block recv_sack_cache[4];
 
  struct sk_buff *highest_sack;   /* highest skb with SACK received
           * (validity guaranteed only if
           * sacked_out > 0)
           */
 
  int     lost_cnt_hint;
  u32     retransmit_high;  /* L-bits may be on up to this seqno */
 
  u32  lost_retrans_low;  /* Sent seq after any rxmit (lowest) */
 
  u32  prior_ssthresh; /* ssthresh saved at recovery start  */
  u32  high_seq;  /* snd_nxt at onset of congestion  */
 
  u32  retrans_stamp;  /* Timestamp of the last retransmit,
         * also used in SYN-SENT to remember stamp of
         * the first SYN. */
  u32  undo_marker;  /* tracking retrans started here. */
  int  undo_retrans;  /* number of undoable retransmissions. */
  u32  total_retrans;  /* Total retransmits for entire connection */
 
  u32  urg_seq;  /* Seq of received urgent pointer */
  unsigned int    keepalive_time;    /* time before keep alive takes place */
  unsigned int    keepalive_intvl;  /* time interval between keep alive probes */
 
  int      linger2;
 
/* Receiver side RTT estimation */
  struct {
    u32  rtt;
    u32  seq;
    u32  time;
  } rcv_rtt_est;
 
/* Receiver queue space */
  struct {
    int  space;
    u32  seq;
    u32  time;
  } rcvq_space;
 
/* TCP-specific MTU probe information. */
  struct {
    u32      probe_seq_start;
    u32      probe_seq_end;
  } mtu_probe;
 
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
  const struct tcp_sock_af_ops  *af_specific;
 
/* TCP MD5 Signature Option information */
  struct tcp_md5sig_info  *md5sig_info;
#endif /* CONFIG_TCP_MD5SIG */
};

在tcp_sock的结构体的第一个成员变量类型为struct inet_connection_sock(include/net/inet_connection_sock.h)

C

67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/** inet_connection_sock - INET connection oriented sock
 *
 * @icsk_accept_queue:     FIFO of established children
 * @icsk_bind_hash:     Bind node
 * @icsk_timeout:     Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:       Retransmit timeout
 * @icsk_pmtu_cookie:     Last pmtu seen by socket
 * @icsk_ca_ops:     Pluggable congestion control hook
 * @icsk_af_ops:     Operations which are AF_INET{4,6} specific
 * @icsk_ca_state:     Congestion control state
 * @icsk_retransmits:     Number of unrecovered [RTO] timeouts
 * @icsk_pending:     Scheduled timer event
 * @icsk_backoff:     Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:     unanswered 0 window probes
 * @icsk_ext_hdr_len:     Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:       Delayed ACK control data
 * @icsk_mtup:       MTU probing control data
 */
struct inet_connection_sock {
  /* inet_sock has to be the first member! */
  struct inet_sock    icsk_inet;
  struct request_sock_queue icsk_accept_queue;
  struct inet_bind_bucket    *icsk_bind_hash;
  unsigned long      icsk_timeout;
   struct timer_list    icsk_retransmit_timer;
   struct timer_list    icsk_delack_timer;  /* not covered by the kernel-doc above */
  __u32        icsk_rto;
  __u32        icsk_pmtu_cookie;
  const struct tcp_congestion_ops *icsk_ca_ops;
  const struct inet_connection_sock_af_ops *icsk_af_ops;
  unsigned int      (*icsk_sync_mss)(struct sock *sk, u32 pmtu);  /* not covered by the kernel-doc above */
  __u8        icsk_ca_state;
  __u8        icsk_retransmits;
  __u8        icsk_pending;
  __u8        icsk_backoff;
  __u8        icsk_syn_retries;
  __u8        icsk_probes_out;
  __u16        icsk_ext_hdr_len;
  struct {
    __u8      pending;   /* ACK is pending         */
    __u8      quick;   /* Scheduled number of quick acks     */
    __u8      pingpong;   /* The session is interactive       */
    __u8      blocked;   /* Delayed ACK was blocked by socket lock */
    __u32      ato;     /* Predicted tick of soft clock     */
    unsigned long    timeout;   /* Currently scheduled timeout       */
    __u32      lrcvtime;   /* timestamp of last received data packet */
    __u16      last_seg_size; /* Size of last incoming segment     */
    __u16      rcv_mss;   /* MSS used for delayed ACK decisions     */
  } icsk_ack;
  struct {
    int      enabled;
 
    /* Range of MTUs to search */
    int      search_high;
    int      search_low;
 
    /* Information on the current probe. */
    int      probe_size;
  } icsk_mtup;
  u32        icsk_ca_priv[16];  /* ICSK_CA_PRIV_SIZE below is this array's size in bytes */
#define ICSK_CA_PRIV_SIZE  (16 * sizeof(u32))
};

在 inet_connection_sock结构体中第一个成员变量类型为struct inet_sock(include/net/inet_sock.h)

C

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/** struct inet_sock - representation of INET sockets
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @daddr - Foreign IPv4 addr
 * @rcv_saddr - Bound local IPv4 addr
 * @dport - Destination port
 * @num - Local port
 * @saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @sport - Source port
 * @id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @is_icsk - is this an inet_connection_sock?
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 */
struct inet_sock {
  /* sk and pinet6 have to be the first two members of inet_sock */
  struct sock    sk;  /* first member: &inet->sk and inet are the same address */
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
  struct ipv6_pinfo  *pinet6;
#endif
  /* Socket demultiplex comparisons on incoming packets. */
  __be32      daddr;
  __be32      rcv_saddr;
  __be16      dport;
  __u16      num;
  __be32      saddr;
  __s16      uc_ttl;
  __u16      cmsg_flags;
  struct ip_options  *opt;
  __be16      sport;
  __u16      id;
  __u8      tos;
  __u8      mc_ttl;
  __u8      pmtudisc;
  __u8      recverr:1,
        is_icsk:1,
        freebind:1,
        hdrincl:1,
        mc_loop:1,
        transparent:1,
        mc_all:1;  /* seven 1-bit flags packed into one __u8 */
  int      mc_index;
  __be32      mc_addr;
  struct ip_mc_socklist  *mc_list;
  struct {
    unsigned int    flags;
    unsigned int    fragsize;
    struct ip_options  *opt;
    struct dst_entry  *dst;
    int      length; /* Total length of all frames */
    __be32      addr;
    struct flowi    fl;
  } cork;
};

而inet_sock的第一个成员正是struct sock类型,所以sk_prot_alloc直接返回struct sock *类型指针是没有问题的,接下来执行inet_create中的353行用inet_sk通过sk获得inet指针的值,inet_sk函数其实就相当于强制类型转换,返回的就是sk的指针。
接下来程序就一路返回到__sock_create,接着再返回到sys_socket中。在sys_socket中调用了最后一个函数sock_map_fd(net/socket.c),将socket指针sock与一个已经打开的文件号关联起来返回给用户程序。

C

335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/*
 *  Obtains the first available file descriptor and sets it up for use.
 *
 *  These functions create a file structure and map it into the fd space
 *  of the current process. On success the file descriptor is returned
 *  and the file struct is implicitly stored in sock->file.
 *  Note that another thread may close the file descriptor before we return
 *  from this function. We rely on the fact that we do not refer
 *  to the socket after mapping. If one day we need to, this
 *  function will have to increment the ref. count on file by 1.
 *
 *  In any case the returned fd MAY NOT be valid!
 *  This race condition is unavoidable
 *  with shared fd spaces; we cannot solve it inside the kernel,
 *  but we still take care of internal coherence.
 */
 
static int sock_alloc_fd(struct file **filep, int flags)
{
  int fd;
 
  fd = get_unused_fd_flags(flags);
  if (likely(fd >= 0)) {
    struct file *file = get_empty_filp();
 
    *filep = file;  /* NULL when no file struct could be obtained */
    if (unlikely(!file)) {
      put_unused_fd(fd);  /* give the fd number back */
      return -ENFILE;
    }
  } else
    *filep = NULL;  /* fd holds the negative error returned below */
  return fd;
}
 
static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
{
  struct dentry *dentry;
  struct qstr name = { .name = "" };  /* the dentry gets an empty name */
 
  dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);  /* child of the sock_mnt root */
  if (unlikely(!dentry))
    return -ENOMEM;  /* nothing set up yet; the caller cleans up file and fd */
 
  dentry->d_op = &sockfs_dentry_operations;
  /*
   * We don't want to push this dentry into the global dentry hash table.
   * We pretend the dentry is already hashed by clearing DCACHE_UNHASHED.
   * This permits a working /proc/$pid/fd/XXX on sockets.
   */
  dentry->d_flags &= ~DCACHE_UNHASHED;
  d_instantiate(dentry, SOCK_INODE(sock));  /* bind the dentry to the socket's inode */
 
  sock->file = file;  /* socket -> file link */
  init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
      &socket_file_ops);
  SOCK_INODE(sock)->i_fop = &socket_file_ops;
  file->f_flags = O_RDWR | (flags & O_NONBLOCK);  /* only O_NONBLOCK is taken from flags */
  file->f_pos = 0;
  file->private_data = sock;  /* file -> socket link */
 
  return 0;
}
 
int sock_map_fd(struct socket *sock, int flags)
{
  struct file *newfile;
  int fd = sock_alloc_fd(&newfile, flags);  /* fd number plus an empty file struct */
 
  if (likely(fd >= 0)) {
    int err = sock_attach_fd(sock, newfile, flags);  /* wire sock, dentry and file together */
 
    if (unlikely(err < 0)) {
      put_filp(newfile);  /* undo both halves of sock_alloc_fd() */
      put_unused_fd(fd);
      return err;
    }
    fd_install(fd, newfile);  /* only after the file is fully set up */
  }
  return fd;  /* fd on success, else the negative error from sock_alloc_fd() */
}

fs/dcache.c

C

982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
/* Attach inode (may be NULL) to dentry; the caller must hold dcache_lock */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
  if (inode)
    list_add(&dentry->d_alias, &inode->i_dentry);  /* add dentry to the inode's i_dentry list */
  dentry->d_inode = inode;  /* done even when inode is NULL */
  fsnotify_d_instantiate(dentry, inode);
}
 
/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode * inode)
{
  BUG_ON(!list_empty(&entry->d_alias));  /* entry must not already be on an alias list */
  spin_lock(&dcache_lock);  /* __d_instantiate() requires dcache_lock */
  __d_instantiate(entry, inode);
  spin_unlock(&dcache_lock);
  security_d_instantiate(entry, inode);  /* called after dcache_lock is dropped */
}

net/socket.c

C

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
 *  Socket files have a set of 'special' operations as well as the generic file ones. These don't
 *  appear in the operation structures but are done directly via the socketcall() multiplexor.
 */
 
static const struct file_operations socket_file_ops = {  /* installed by sock_attach_fd() */
  .owner =  THIS_MODULE,
  .llseek =  no_llseek,
  .aio_read =  sock_aio_read,
  .aio_write =  sock_aio_write,
  .poll =    sock_poll,
  .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
  .compat_ioctl = compat_sock_ioctl,
#endif
  .mmap =    sock_mmap,
  .open =    sock_no_open,  /* special open code to disallow open via /proc */
  .release =  sock_close,
  .fasync =  sock_fasync,
  .sendpage =  sock_sendpage,
  .splice_write = generic_splice_sendpage,
  .splice_read =  sock_splice_read,
};

在sock_map_fd中先通过402行获得一个未使用的文件号以及file结构,然后通过405行调用sock_attach_fd将文件号与sock关联起来。在sock_attach_fd中先通过375行从sockfs中分配一个dentry,其中sock_mnt就是在描述sockfs时提到的;d_instantiate的作用就是将dentry与socket的inode关联起来,然后388行又将sock->file与file关联起来。389~390行将socket文件上的操作初始化为socket_file_ops。这样,通过send/recv进入内核将调用inet_stream_ops中的函数,而通过read/write调用将调用socket_file_ops中的函数。然后返回至sys_socket函数中,再经过系统调用切换到用户态,socket函数的整个调用过程完成。