作者:姚开健
原创作品转载请注明出处
《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000
进程的描述
Linux系统的进程由一个进程描述符PCB,即task_struct结构体来描述,其在内核中代码实现如下:
代码很长,这里不一一分析其结构,可参考网上的其他相关文章。从代码可以知道,进程信息一般包括进程状态、进程调度信息、进程标识符、进程通信有关信息、进程链接信息、时间和定时器信息、文件系统信息、虚拟内存信息、页面管理信息、对称多处理机(SMP)信息,以及和处理器相关的上下文信息等,如下面的简略图所示:
struct task_struct {
1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1237 void *stack;
1238 atomic_t usage;
1239 unsigned int flags; /* per process flags, defined below */
1240 unsigned int ptrace;
1241
1242#ifdef CONFIG_SMP
1243 struct llist_node wake_entry;
1244 int on_cpu;
1245 struct task_struct *last_wakee;
1246 unsigned long wakee_flips;
1247 unsigned long wakee_flip_decay_ts;
1248
1249 int wake_cpu;
1250#endif
1251 int on_rq;
1252
1253 int prio, static_prio, normal_prio;
1254 unsigned int rt_priority;
1255 const struct sched_class *sched_class;
1256 struct sched_entity se;
1257 struct sched_rt_entity rt;
1258#ifdef CONFIG_CGROUP_SCHED
1259 struct task_group *sched_task_group;
1260#endif
1261 struct sched_dl_entity dl;
1262
1263#ifdef CONFIG_PREEMPT_NOTIFIERS
1264 /* list of struct preempt_notifier: */
1265 struct hlist_head preempt_notifiers;
1266#endif
1267
1268#ifdef CONFIG_BLK_DEV_IO_TRACE
1269 unsigned int btrace_seq;
1270#endif
1271
1272 unsigned int policy;
1273 int nr_cpus_allowed;
1274 cpumask_t cpus_allowed;
1275
1276#ifdef CONFIG_PREEMPT_RCU
1277 int rcu_read_lock_nesting;
1278 union rcu_special rcu_read_unlock_special;
1279 struct list_head rcu_node_entry;
1280#endif /* #ifdef CONFIG_PREEMPT_RCU */
1281#ifdef CONFIG_TREE_PREEMPT_RCU
1282 struct rcu_node *rcu_blocked_node;
1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1284#ifdef CONFIG_TASKS_RCU
1285 unsigned long rcu_tasks_nvcsw;
1286 bool rcu_tasks_holdout;
1287 struct list_head rcu_tasks_holdout_list;
1288 int rcu_tasks_idle_cpu;
1289#endif /* #ifdef CONFIG_TASKS_RCU */
1290
1291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1292 struct sched_info sched_info;
1293#endif
1294
1295 struct list_head tasks;
1296#ifdef CONFIG_SMP
1297 struct plist_node pushable_tasks;
1298 struct rb_node pushable_dl_tasks;
1299#endif
1300
1301 struct mm_struct *mm, *active_mm;
1302#ifdef CONFIG_COMPAT_BRK
1303 unsigned brk_randomized:1;
1304#endif
1305 /* per-thread vma caching */
1306 u32 vmacache_seqnum;
1307 struct vm_area_struct *vmacache[VMACACHE_SIZE];
1308#if defined(SPLIT_RSS_COUNTING)
1309 struct task_rss_stat rss_stat;
1310#endif
1311/* task state */
1312 int exit_state;
1313 int exit_code, exit_signal;
1314 int pdeath_signal; /* The signal sent when the parent dies */
1315 unsigned int jobctl; /* JOBCTL_*, siglock protected */
1316
1317 /* Used for emulating ABI behavior of previous Linux versions */
1318 unsigned int personality;
1319
1320 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1321 * execve */
1322 unsigned in_iowait:1;
1323
1324 /* Revert to default priority/policy when forking */
1325 unsigned sched_reset_on_fork:1;
1326 unsigned sched_contributes_to_load:1;
1327
1328 unsigned long atomic_flags; /* Flags needing atomic access. */
1329
1330 pid_t pid;
1331 pid_t tgid;
1332
1333#ifdef CONFIG_CC_STACKPROTECTOR
1334 /* Canary value for the -fstack-protector gcc feature */
1335 unsigned long stack_canary;
1336#endif
1337 /*
1338 * pointers to (original) parent process, youngest child, younger sibling,
1339 * older sibling, respectively. (p->father can be replaced with
1340 * p->real_parent->pid)
1341 */
1342 struct task_struct __rcu *real_parent; /* real parent process */
1343 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1344 /*
1345 * children/sibling forms the list of my natural children
1346 */
1347 struct list_head children; /* list of my children */
1348 struct list_head sibling; /* linkage in my parent's children list */
1349 struct task_struct *group_leader; /* threadgroup leader */
1350
1351 /*
1352 * ptraced is the list of tasks this task is using ptrace on.
1353 * This includes both natural children and PTRACE_ATTACH targets.
1354 * p->ptrace_entry is p's link on the p->parent->ptraced list.
1355 */
1356 struct list_head ptraced;
1357 struct list_head ptrace_entry;
1358
1359 /* PID/PID hash table linkage. */
1360 struct pid_link pids[PIDTYPE_MAX];
1361 struct list_head thread_group;
1362 struct list_head thread_node;
1363
1364 struct completion *vfork_done; /* for vfork() */
1365 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1366 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1367
1368 cputime_t utime, stime, utimescaled, stimescaled;
1369 cputime_t gtime;
1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1371 struct cputime prev_cputime;
1372#endif
1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1374 seqlock_t vtime_seqlock;
1375 unsigned long long vtime_snap;
1376 enum {
1377 VTIME_SLEEPING = 0,
1378 VTIME_USER,
1379 VTIME_SYS,
1380 } vtime_snap_whence;
1381#endif
1382 unsigned long nvcsw, nivcsw; /* context switch counts */
1383 u64 start_time; /* monotonic time in nsec */
1384 u64 real_start_time; /* boot based time in nsec */
1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1386 unsigned long min_flt, maj_flt;
1387
1388 struct task_cputime cputime_expires;
1389 struct list_head cpu_timers[3];
1390
1391/* process credentials */
1392 const struct cred __rcu *real_cred; /* objective and real subjective task
1393 * credentials (COW) */
1394 const struct cred __rcu *cred; /* effective (overridable) subjective task
1395 * credentials (COW) */
1396 char comm[TASK_COMM_LEN]; /* executable name excluding path
1397 - access with [gs]et_task_comm (which lock
1398 it with task_lock())
1399 - initialized normally by setup_new_exec */
1400/* file system info */
1401 int link_count, total_link_count;
1402#ifdef CONFIG_SYSVIPC
1403/* ipc stuff */
1404 struct sysv_sem sysvsem;
1405 struct sysv_shm sysvshm;
1406#endif
1407#ifdef CONFIG_DETECT_HUNG_TASK
1408/* hung task detection */
1409 unsigned long last_switch_count;
1410#endif
1411/* CPU-specific state of this task */
1412 struct thread_struct thread;
1413/* filesystem information */
1414 struct fs_struct *fs;
1415/* open file information */
1416 struct files_struct *files;
1417/* namespaces */
1418 struct nsproxy *nsproxy;
1419/* signal handlers */
1420 struct signal_struct *signal;
1421 struct sighand_struct *sighand;
1422
1423 sigset_t blocked, real_blocked;
1424 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
1425 struct sigpending pending;
1426
1427 unsigned long sas_ss_sp;
1428 size_t sas_ss_size;
1429 int (*notifier)(void *priv);
1430 void *notifier_data;
1431 sigset_t *notifier_mask;
1432 struct callback_head *task_works;
1433
1434 struct audit_context *audit_context;
1435#ifdef CONFIG_AUDITSYSCALL
1436 kuid_t loginuid;
1437 unsigned int sessionid;
1438#endif
1439 struct seccomp seccomp;
1440
1441/* Thread group tracking */
1442 u32 parent_exec_id;
1443 u32 self_exec_id;
1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1445 * mempolicy */
1446 spinlock_t alloc_lock;
1447
1448 /* Protection of the PI data structures: */
1449 raw_spinlock_t pi_lock;
1450
1451#ifdef CONFIG_RT_MUTEXES
1452 /* PI waiters blocked on a rt_mutex held by this task */
1453 struct rb_root pi_waiters;
1454 struct rb_node *pi_waiters_leftmost;
1455 /* Deadlock detection and priority inheritance handling */
1456 struct rt_mutex_waiter *pi_blocked_on;
1457#endif
1458
1459#ifdef CONFIG_DEBUG_MUTEXES
1460 /* mutex deadlock detection */
1461 struct mutex_waiter *blocked_on;
1462#endif
1463#ifdef CONFIG_TRACE_IRQFLAGS
1464 unsigned int irq_events;
1465 unsigned long hardirq_enable_ip;
1466 unsigned long hardirq_disable_ip;
1467 unsigned int hardirq_enable_event;
1468 unsigned int hardirq_disable_event;
1469 int hardirqs_enabled;
1470 int hardirq_context;
1471 unsigned long softirq_disable_ip;
1472 unsigned long softirq_enable_ip;
1473 unsigned int softirq_disable_event;
1474 unsigned int softirq_enable_event;
1475 int softirqs_enabled;
1476 int softirq_context;
1477#endif
1478#ifdef CONFIG_LOCKDEP
1479# define MAX_LOCK_DEPTH 48UL
1480 u64 curr_chain_key;
1481 int lockdep_depth;
1482 unsigned int lockdep_recursion;
1483 struct held_lock held_locks[MAX_LOCK_DEPTH];
1484 gfp_t lockdep_reclaim_gfp;
1485#endif
1486
1487/* journalling filesystem info */
1488 void *journal_info;
1489
1490/* stacked block device info */
1491 struct bio_list *bio_list;
1492
1493#ifdef CONFIG_BLOCK
1494/* stack plugging */
1495 struct blk_plug *plug;
1496#endif
1497
1498/* VM state */
1499 struct reclaim_state *reclaim_state;
1500
1501 struct backing_dev_info *backing_dev_info;
1502
1503 struct io_context *io_context;
1504
1505 unsigned long ptrace_message;
1506 siginfo_t *last_siginfo; /* For ptrace use. */
1507 struct task_io_accounting ioac;
1508#if defined(CONFIG_TASK_XACCT)
1509 u64 acct_rss_mem1; /* accumulated rss usage */
1510 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1511 cputime_t acct_timexpd; /* stime + utime since last update */
1512#endif
1513#ifdef CONFIG_CPUSETS
1514 nodemask_t mems_allowed; /* Protected by alloc_lock */
1515 seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
1516 int cpuset_mem_spread_rotor;
1517 int cpuset_slab_spread_rotor;
1518#endif
1519#ifdef CONFIG_CGROUPS
1520 /* Control Group info protected by css_set_lock */
1521 struct css_set __rcu *cgroups;
1522 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1523 struct list_head cg_list;
1524#endif
1525#ifdef CONFIG_FUTEX
1526 struct robust_list_head __user *robust_list;
1527#ifdef CONFIG_COMPAT
1528 struct compat_robust_list_head __user *compat_robust_list;
1529#endif
1530 struct list_head pi_state_list;
1531 struct futex_pi_state *pi_state_cache;
1532#endif
1533#ifdef CONFIG_PERF_EVENTS
1534 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1535 struct mutex perf_event_mutex;
1536 struct list_head perf_event_list;
1537#endif
1538#ifdef CONFIG_DEBUG_PREEMPT
1539 unsigned long preempt_disable_ip;
1540#endif
1541#ifdef CONFIG_NUMA
1542 struct mempolicy *mempolicy; /* Protected by alloc_lock */
1543 short il_next;
1544 short pref_node_fork;
1545#endif
1546#ifdef CONFIG_NUMA_BALANCING
1547 int numa_scan_seq;
1548 unsigned int numa_scan_period;
1549 unsigned int numa_scan_period_max;
1550 int numa_preferred_nid;
1551 unsigned long numa_migrate_retry;
1552 u64 node_stamp; /* migration stamp */
1553 u64 last_task_numa_placement;
1554 u64 last_sum_exec_runtime;
1555 struct callback_head numa_work;
1556
1557 struct list_head numa_entry;
1558 struct numa_group *numa_group;
1559
1560 /*
1561 * Exponential decaying average of faults on a per-node basis.
1562 * Scheduling placement decisions are made based on the these counts.
1563 * The values remain static for the duration of a PTE scan
1564 */
1565 unsigned long *numa_faults_memory;
1566 unsigned long total_numa_faults;
1567
1568 /*
1569 * numa_faults_buffer records faults per node during the current
1570 * scan window. When the scan completes, the counts in
1571 * numa_faults_memory decay and these values are copied.
1572 */
1573 unsigned long *numa_faults_buffer_memory;
1574
1575 /*
1576 * Track the nodes the process was running on when a NUMA hinting
1577 * fault was incurred.
1578 */
1579 unsigned long *numa_faults_cpu;
1580 unsigned long *numa_faults_buffer_cpu;
1581
1582 /*
1583 * numa_faults_locality tracks if faults recorded during the last
1584 * scan window were remote/local. The task scan period is adapted
1585 * based on the locality of the faults with different weights
1586 * depending on whether they were shared or private faults
1587 */
1588 unsigned long numa_faults_locality[2];
1589
1590 unsigned long numa_pages_migrated;
1591#endif /* CONFIG_NUMA_BALANCING */
1592
1593 struct rcu_head rcu;
1594
1595 /*
1596 * cache last used pipe for splice
1597 */
1598 struct pipe_inode_info *splice_pipe;
1599
1600 struct page_frag task_frag;
1601
1602#ifdef CONFIG_TASK_DELAY_ACCT
1603 struct task_delay_info *delays;
1604#endif
1605#ifdef CONFIG_FAULT_INJECTION
1606 int make_it_fail;
1607#endif
1608 /*
1609 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1610 * balance_dirty_pages() for some dirty throttling pause
1611 */
1612 int nr_dirtied;
1613 int nr_dirtied_pause;
1614 unsigned long dirty_paused_when; /* start of a write-and-pause period */
1615
1616#ifdef CONFIG_LATENCYTOP
1617 int latency_record_count;
1618 struct latency_record latency_record[LT_SAVECOUNT];
1619#endif
1620 /*
1621 * time slack values; these are used to round up poll() and
1622 * select() etc timeout values. These are in nanoseconds.
1623 */
1624 unsigned long timer_slack_ns;
1625 unsigned long default_timer_slack_ns;
1626
1627#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1628 /* Index of current stored address in ret_stack */
1629 int curr_ret_stack;
1630 /* Stack of return addresses for return function tracing */
1631 struct ftrace_ret_stack *ret_stack;
1632 /* time stamp for last schedule */
1633 unsigned long long ftrace_timestamp;
1634 /*
1635 * Number of functions that haven't been traced
1636 * because of depth overrun.
1637 */
1638 atomic_t trace_overrun;
1639 /* Pause for the tracing */
1640 atomic_t tracing_graph_pause;
1641#endif
1642#ifdef CONFIG_TRACING
1643 /* state flags for use by tracers */
1644 unsigned long trace;
1645 /* bitmask and counter of trace recursion */
1646 unsigned long trace_recursion;
1647#endif /* CONFIG_TRACING */
1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1649 unsigned int memcg_kmem_skip_account;
1650 struct memcg_oom_info {
1651 struct mem_cgroup *memcg;
1652 gfp_t gfp_mask;
1653 int order;
1654 unsigned int may_oom:1;
1655 } memcg_oom;
1656#endif
1657#ifdef CONFIG_UPROBES
1658 struct uprobe_task *utask;
1659#endif
1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
1661 unsigned int sequential_io;
1662 unsigned int sequential_io_avg;
1663#endif
1664};
进程的创建
当说明了进程的描述之后,来分析一下进程的创建过程。
创建进程的系统调用有fork()、vfork()和clone()这三个。fork和vfork的区别在于:fork需要拷贝父进程的内核数据空间,而vfork在子进程调用exec或exit之前与父进程共用数据空间;fork创建了子进程后不限定父进程与子进程的执行顺序,而vfork需要在子进程调用exec或exit之前让父进程阻塞,子进程先执行。clone主要是对简单的进程进行创建。三个系统调用都是通过调用do_fork()来进行进程的创建。
真正执行进程创建的是copy_process()函数,它完成子进程对父进程PCB的复制、修改与初始化。copy_process()接着调用dup_task_struct()为新进程创建一个内核栈。
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
...
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
...
}
p = dup_task_struct(current);
retval = copy_thread(clone_flags, stack_start, stack_size, p);
复制父进程堆栈的内容到子进程的堆栈中去。这其中,copy_thread()函数中的语句p->thread.ip = (unsigned long) ret_from_fork;决定了新进程的第一条指令地址。
/*
 * dup_task_struct - allocate a task_struct and a thread_info for a new task
 * and initialise them from @orig.
 *
 * @orig: the task to duplicate.
 *
 * Returns the new task_struct on success. Returns NULL if either allocation
 * fails or arch_dup_task_struct() reports an error; in that case everything
 * allocated here has been released again.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
/* Node hint from tsk_fork_get_node(); handed to both allocators below. */
int node = tsk_fork_get_node(orig);
int err;
/* First allocation: nothing to undo yet, so fail with a plain return. */
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
/* Second allocation: on failure only tsk has to be released. */
ti = alloc_thread_info_node(tsk, node);
if (!ti)
goto free_tsk;
/*
 * Arch hook that populates tsk from orig.
 * NOTE(review): presumably a copy of the whole structure; confirm in
 * the architecture code.
 */
err = arch_dup_task_struct(tsk, orig);
if (err)
goto free_ti;
/* Point the new task at the thread_info allocated above. */
tsk->stack = ti;
# ifdef CONFIG_SECCOMP
/* The new task starts with no seccomp filter pointer. */
tsk->seccomp.filter = NULL;
# endif
/*
 * Helpers operating on the new task; their bodies are not visible here.
 * NOTE(review): setup_thread_stack() presumably initialises the new
 * thread_info from orig's; confirm against its definition.
 */
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
# ifdef CONFIG_CC_STACKPROTECTOR
/* The new task gets its own canary value from get_random_int(). */
tsk->stack_canary = get_random_int();
# endif
/* Usage count starts at 2, not 1. TODO confirm who owns each reference. */
atomic_set(&tsk->usage, 2);
# ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
# endif
/*
 * Clear cached pointers in the new task (presumably so that it does not
 * share them with orig; verify).
 */
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
/* Account for the new thread_info/stack; meaning of the 1 is not visible here. */
account_kernel_stack(ti, 1);
return tsk;
/* Error unwinding: release in the reverse order of allocation. */
free_ti:
free_thread_info(ti);
free_tsk:
free_task_struct(tsk);
return NULL;
}
新进程的thread_info和内核堆栈在所分配页面中的分布如下所示:
按照上面的代码,task_struct本身由alloc_task_struct_node()单独分配,而tsk->stack指向的thread_info是按page分配的,该区域中多余的部分作为该进程的内核堆栈,从栈底(高地址)向thread_info方向延伸。
新进程的执行
在之前的函数分析中已经说明,新进程的thread.ip被初始化为ret_from_fork,这是一段汇编代码。
前面的分析还谈到copy_process中的copy_thread()函数,正是这个函数决定了子进程从系统调用中返回后的执行:
/*
 * copy_thread (abridged; "..." marks code omitted from this listing) -
 * prepare the saved register set and the resume address of the new task @p.
 *
 * @sp: if non-zero, stored as the stack pointer in the child's register copy.
 * @p:  the new task whose thread.ip is set.
 * clone_flags and arg are not used in the visible part.
 */
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p)
{
...
/*
 * childregs is declared in the omitted code.
 * NOTE(review): presumably the pt_regs area of the child; confirm.
 * It starts out as a copy of the current task's saved registers.
 */
*childregs = *current_pt_regs();
/*
 * ax in the child's copy is forced to 0.
 * NOTE(review): presumably the value the child sees as the system
 * call's return value; confirm.
 */
childregs->ax = 0;
/* Only a non-zero sp overrides the stack pointer copied above. */
if (sp)
childregs->sp = sp;
/* Record ret_from_fork as the instruction address stored for the new task. */
p->thread.ip = (unsigned long) ret_from_fork;
...
}
/*
 * ret_from_fork - address that copy_thread() stores in p->thread.ip for a
 * new task. It calls schedule_tail, resets EFLAGS and then jumps to
 * syscall_exit.
 */
ENTRY(ret_from_fork)
CFI_STARTPROC
/*
 * Keep %eax on the stack across the call.
 * NOTE(review): presumably it also serves as the argument to
 * schedule_tail; confirm against the calling convention in use.
 */
pushl_cfi %eax
call schedule_tail
/* NOTE(review): presumably loads the thread_info pointer into %ebp; confirm. */
GET_THREAD_INFO(%ebp)
/* Restore the %eax value pushed before the call. */
popl_cfi %eax
/* 0x0202 = bit 9 (IF, interrupts enabled) | bit 1 (reserved, always set). */
pushl_cfi $0x0202 # Reset kernel eflags
/* Pop the constant just pushed into EFLAGS. */
popfl_cfi
/* Continue at syscall_exit. */
jmp syscall_exit
CFI_ENDPROC
END(ret_from_fork)
上述的ret_from_fork就是新进程的执行点。
新进程的内核堆栈的初始内容是父进程通过SAVE_ALL保存的现场,所以新进程执行ret_from_fork之后会有一个RESTORE_ALL,把内核堆栈中的数据恢复之后,就可以离开内核态、进入用户态执行。
总结
1、Linux内核创建一个新进程时有三个系统调用:fork()、vfork()和clone(),fork和vfork的区别上面已分析。它们都是通过do_fork()来创建进程。
2、创建进程往往把父进程的PCB拷贝给子进程,然后再拷贝内核堆栈,子进程需要对其修改并初始化,这样才能是一个可以运行的进程。通常实现的时候是写时复制,就是当子进程需要用到一些数据结构时,才创建一个新的数据结构给它。
3、新进程的执行点是ret_from_fork,恢复堆栈数据后就可以离开内核返回到用户态执行。