内核源码:linux-2.6.38.8.tar.bz2
目标平台:ARM体系结构
进程终止时,一般是调用exit库函数(无论是程序员显式调用还是编译器自动地把exit库函数插入到main函数的最后一条语句之后)来释放进程所拥有的资源。
$ man 3 exit
void exit(int status);
$ man 2 exit_group
void exit_group(int status);
$ man 3 pthread_exit
void pthread_exit(void *retval);
$ man 2 _exit
void _exit(int status);
库函数exit使用系统调用exit_group来终止整个线程组,库函数pthread_exit使用系统调用_exit来终止某一个线程。
这两个系统调用(exit_group和_exit)在Linux内核中的入口点函数分别为sys_exit_group和sys_exit。
/* linux-2.6.38.8/kernel/exit.c */
/*
 * Entry point of the exit system call: keeps the low 8 bits of error_code,
 * shifts them left by 8 and passes the result to do_exit().
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
do_exit((error_code&0xff)<<8);
}
/*
 * Entry point of the exit_group system call: keeps the low 8 bits of
 * error_code, shifts them left by 8 and passes the result to
 * do_group_exit().
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
do_group_exit((error_code & 0xff) << 8);
/* NOTREACHED */
return 0;
}
do_group_exit函数会杀死属于当前进程所在线程组的所有进程。它接受进程终止代号作为参数,进程终止代号可能是系统调用exit_group(正常结束)指定的一个值,也可能是内核提供的一个错误码(异常结束)。
/*
 * Terminate the whole thread group of the current task, then exit the
 * current task itself through do_exit().
 *
 * exit_code: termination code for the group. If a group exit is already in
 * progress, the code recorded in sig->group_exit_code is used instead.
 */
NORET_TYPE void
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
BUG_ON(exit_code & 0x80); /* core dumps don't get here */
if (signal_group_exit(sig)) // a group exit is already under way (per the article: SIGNAL_GROUP_EXIT set in current->signal->flags, or current->signal->group_exit_task non-NULL)
exit_code = sig->group_exit_code; // reuse the termination code already stored for the group
else if (!thread_group_empty(current)) { // the thread group has other members
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
zap_other_threads(current); // kill every other thread in the group
}
spin_unlock_irq(&sighand->siglock);
}
do_exit(exit_code);
/* NOTREACHED */
}
进程终止所要完成的任务都是由do_exit函数来处理。
/* linux-2.6.38.8/kernel/exit.c */
NORET_TYPE void do_exit(long code)
1、触发task_exit_nb通知链实例的处理函数
profile_task_exit(tsk);
/* linux-2.6.38.8/drivers/oprofile/buffer_sync.c */
/* Notifier block whose callback is task_exit_notify. */
static struct notifier_block task_exit_nb = {
.notifier_call = task_exit_notify,
};
2、检查current->fs_excl是否为0,不为0时也不会终止后续代码的执行
WARN_ON(atomic_read(&tsk->fs_excl));
/* linux-2.6.38.8/include/asm-generic/bug.h */
/*
 * Generic WARN_ON, defined only if nothing defined it earlier.
 * Evaluates condition once, calls __WARN() when it is true, and yields the
 * truth value so the caller can test it. Execution continues either way.
 */
#ifndef WARN_ON
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
__WARN(); /* report where the warning fired (file and line) */ \
unlikely(__ret_warn_on); \
})
#endif
3、合法性检查(不满足时直接调用panic,而不是输出oops消息)
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
中断上下文不能执行do_exit函数,也不能终止PID为0的进程。
4、设定进程可以使用的虚拟地址的上限(用户空间)
set_fs(USER_DS);
/* linux-2.6.38.8/arch/arm/include/asm/uaccess.h */
#define USER_DS TASK_SIZE
#define TASK_SIZE (UL(CONFIG_PAGE_OFFSET) - UL(0x01000000))
/*
 * Store fs as the current thread's addr_limit and update access to the
 * kernel domain: DOMAIN_CLIENT when fs is non-zero, DOMAIN_MANAGER when it
 * is zero.
 */
static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
}
5、current->flags的PF_EXITING标志表示进程正在被删除。
if (unlikely(tsk->flags & PF_EXITING)) {// PF_EXITING is ALREADY set, i.e. do_exit() was re-entered; only then does the body below run
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
tsk->flags |= PF_EXITPIDONE;
set_current_state(TASK_UNINTERRUPTIBLE); // put the task into the uninterruptible sleep state
schedule(); // switch to another task
}
6、设置current->irqaction->flags的IRQTF_DIED标志,表示清除当前进程的中断服务例程
exit_irq_thread();
/* linux-2.6.38.8/kernel/irq/manage.c */
/*
 * Called on exit of the current task. Does nothing unless the task has an
 * irqaction attached; in that case it logs an error and sets IRQTF_DIED in
 * the action's flags.
 */
void exit_irq_thread(void)
{
struct task_struct *tsk = current;
if (!tsk->irqaction)
return;
printk(KERN_ERR
"exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
/*
 * Set the THREAD DIED flag to prevent further wakeups of the
 * soon to be gone threaded handler.
 */
set_bit(IRQTF_DIED, &tsk->irqaction->flags);
}
7、设置PF_EXITING标志
exit_signals(tsk); /* sets PF_EXITING */
/* linux-2.6.38.8/kernel/signal.c */
/*
 * Mark tsk as exiting by setting PF_EXITING. When tsk is one thread of a
 * group that is not exiting as a whole, additionally re-evaluate pending
 * signals on the sibling threads and complete a pending group stop if this
 * exit drops group_stop_count to zero.
 */
void exit_signals(struct task_struct *tsk)
{
int group_stop = 0;
struct task_struct *t;
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { // no other thread in the group, or the whole group is already exiting: setting the flag is all that is needed
tsk->flags |= PF_EXITING;
return;
}
spin_lock_irq(&tsk->sighand->siglock);
/*
 * From now this task is not visible for group-wide signals,
 * see wants_signal(), do_signal_stop().
 */
tsk->flags |= PF_EXITING;
if (!signal_pending(tsk)) // nothing pending on tsk, so there is nothing to hand over to a sibling
goto out;
/* It could be that __group_complete_signal() choose us to
 * notify about group-wide signal. Another thread should be
 * woken now to take the signal since we will not.
 */
for (t = tsk; (t = next_thread(t)) != tsk; ) // walk every other thread in the group
if (!signal_pending(t) && !(t->flags & PF_EXITING)) // sibling has no pending-signal indication and is not itself exiting
recalc_sigpending_and_wake(t); // have it recompute its pending state (and wake it, per the helper's name)
if (unlikely(tsk->signal->group_stop_count) &&
!--tsk->signal->group_stop_count) { // a group stop was in progress and this task was the last one being counted
tsk->signal->flags = SIGNAL_STOP_STOPPED;
group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
}
out:
spin_unlock_irq(&tsk->sighand->siglock);
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(tsk, group_stop);
read_unlock(&tasklist_lock);
}
}
8)、内存屏障,用于确保在它之后的操作开始执行之前,它之前的操作已经完成
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock); // spin until tsk->pi_lock is observed unlocked; the lock itself is NOT acquired
/* linux-2.6.38.8/arch/arm/include/asm/system.h */
#define smp_mb() barrier() //!CONFIG_SMP
/* linux-2.6.38.8/include/linux/compiler-gcc.h */
#define barrier() __asm__ __volatile__("": : :"memory")
9)、更新进程的内存会计积分(tsk->acct_rss_mem1和tsk->acct_vm_mem1),其中需要读取current->mm->rss_stat.count[member]计数
acct_update_integrals(tsk);
/*
 * Accumulate the memory-usage integrals of tsk. The CPU time consumed since
 * the last update (stime + utime minus acct_timexpd) is converted to
 * microseconds, and acct_rss_mem1 / acct_vm_mem1 grow by that delta times
 * the current RSS / total_vm. Does nothing when tsk has no mm or when the
 * delta is zero. Runs with local interrupts disabled.
 */
void acct_update_integrals(struct task_struct *tsk)
{
if (likely(tsk->mm)) {
cputime_t time, dtime;
struct timeval value;
unsigned long flags;
u64 delta;
local_irq_save(flags);
time = tsk->stime + tsk->utime;
dtime = cputime_sub(time, tsk->acct_timexpd);
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
delta = delta * USEC_PER_SEC + value.tv_usec;
if (delta == 0)
goto out;
tsk->acct_timexpd = time;
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); // weight by resident pages (MM_FILEPAGES + MM_ANONPAGES)
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; // weight by total_vm, the size of the address space in pages
out:
local_irq_restore(flags);
}
}
/* linux-2.6.38.8/include/linux/mm.h */
/* Return the sum of the MM_FILEPAGES and MM_ANONPAGES counters of mm. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
return get_mm_counter(mm, MM_FILEPAGES) +
get_mm_counter(mm, MM_ANONPAGES);
}
/* Return mm->rss_stat.count[member] as a plain read. */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) // variant for !USE_SPLIT_PTLOCKS (per the article)
{
return mm->rss_stat.count[member];
}
然后,把进程私有的计数(task->rss_stat.count[])累加到内存描述符mm的计数器中,并把进程私有的计数清零。
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk, tsk->mm);
/* linux-2.6.38.8/mm/memory.c */
/* Thin wrapper that forwards task and mm to __sync_task_rss_stat(). */
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
__sync_task_rss_stat(task, mm);
}
/*
 * Fold the per-task cached RSS counters into mm: each non-zero
 * task->rss_stat.count[i] is added to the matching mm counter and then
 * reset to zero. The task's event counter is reset as well.
 */
static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) { // three counter kinds per the article: MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS
if (task->rss_stat.count[i]) {
add_mm_counter(mm, i, task->rss_stat.count[i]);
task->rss_stat.count[i] = 0;
}
}
task->rss_stat.events = 0;
}
10)、清除定时器
group_dead = atomic_dec_and_test(&tsk->signal->live); //live用来表示线程组中活动进程的数量
if (group_dead) { //当没有活动的进程时
hrtimer_cancel(&tsk->signal->real_timer); //取消高精度定时器
exit_itimers(tsk->signal); //删除POSIX.1b类型的定时器
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); //获取进程所拥有的最大页框数
}
11)、收集进程会计信息
acct_collect(code, group_dead);
12)、审计
if (group_dead)
tty_audit_exit(); //记录审计事件
if (unlikely(tsk->audit_context))
audit_free(tsk); //释放struct audit_context结构体
13)、输出taskstats信息
tsk->exit_code = code; //设置终止代码
taskstats_exit(tsk, group_dead);
14)、释放线性区描述符和页表
exit_mm(tsk);
/* linux-2.6.38.8/kernel/exit.c */
/*
 * Detach tsk from its address space. After mm_release(), a task with no mm
 * returns immediately. Otherwise the function synchronises with a pending
 * core dump if there is one, takes an mm_count reference, clears tsk->mm
 * under task_lock, and finally drops the user reference with mmput().
 */
static void exit_mm(struct task_struct * tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
mm_release(tsk, mm); // per the article: this wakes tsk->vfork_done so a parent blocked in vfork can resume
if (!mm)
return;
/*
 * Serialize with any possible pending coredump.
 * We must hold mmap_sem around checking core_state
 * and clearing tsk->mm. The core-inducing thread
 * will increment ->nr_threads for each thread in the
 * group with ->mm != NULL.
 */
down_read(&mm->mmap_sem);
core_state = mm->core_state;
if (core_state) { // a core dump is in progress
struct core_thread self;
up_read(&mm->mmap_sem);
self.task = tsk;
self.next = xchg(&core_state->dumper.next, &self);
/*
 * Implies mb(), the result of xchg() must be visible
 * to core_state->dumper.
 */
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count); // pin the memory descriptor for now; per the article it is released later by finish_task_switch() once the dying task has been switched off this CPU
BUG_ON(mm != tsk->active_mm);
/* more a memory barrier than a real lock */
task_lock(tsk);
tsk->mm = NULL; // the task no longer owns an address space
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current); // switch to lazy-TLB mode (per the article, not supported on ARM)
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk); // CLEARS the freeze flag (TIF_FREEZE), it does not set it
if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
atomic_dec(&mm->oom_disable_count);
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm); // drop the mm_users reference; per the article, when it reaches 0 the memory regions and page tables are freed, while the descriptor survives because of the mm_count reference taken above
}
15)、输出进程会计信息
if (group_dead)
acct_process();
trace_sched_process_exit(tsk); //用于跟踪,定义在linux-2.6.38.8/include/trace/events/sched.h文件中
16)、遍历current->sysvsem.undo_list链表,并清除进程所涉及的每个IPC信号量的操作痕迹
exit_sem(tsk);
17)、释放文件对象相关资源
exit_files(tsk);
/* linux-2.6.38.8/kernel/exit.c */
/*
 * Detach tsk from its open-file table: clear tsk->files under task_lock,
 * then drop the reference with put_files_struct(). Does nothing when
 * tsk->files is already NULL.
 */
void exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
if (files) {
task_lock(tsk);
tsk->files = NULL; // the task no longer references the table
task_unlock(tsk);
put_files_struct(files);
}
}
/*
 * Drop one reference on files. When the count reaches zero, close every
 * open file and free the table. The files_struct itself is freed here only
 * when its fdtable is not the embedded one (files->fdtab).
 */
void put_files_struct(struct files_struct *files)
{
struct fdtable *fdt;
if (atomic_dec_and_test(&files->count)) { // this was the last user of the table
close_files(files); // close what is still open (per the article, this triggers the files' release operations)
/*
 * Free the fd and fdset arrays if we expanded them.
 * If the fdtable was embedded, pass files for freeing
 * at the end of the RCU grace period. Otherwise,
 * you can free files immediately.
 */
rcu_read_lock();
fdt = files_fdtable(files);
if (fdt != &files->fdtab)
kmem_cache_free(files_cachep, files); // fdtable is not the embedded one: free the files_struct right away
free_fdtable(fdt);
rcu_read_unlock();
}
}
18)、释放struct fs_struct结构体
exit_fs(tsk);
/* linux-2.6.38.8/fs/fs_struct.c */
/*
 * Detach tsk from its fs_struct: under task_lock, fs->lock and the fs->seq
 * write section, clear tsk->fs and decrement fs->users. The structure is
 * freed when that count reaches zero. Does nothing when tsk->fs is NULL.
 */
void exit_fs(struct task_struct *tsk)
{
struct fs_struct *fs = tsk->fs;
if (fs) {
int kill;
task_lock(tsk);
spin_lock(&fs->lock);
write_seqcount_begin(&fs->seq);
tsk->fs = NULL; // the task no longer references the fs_struct
kill = !--fs->users; // fs->users counts the tasks sharing this structure
write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
task_unlock(tsk);
if (kill) // no user left
free_fs_struct(fs); // release the structure's memory
}
}
19)、检查有多少未使用的进程内核栈
check_stack_usage();
/* linux-2.6.38.8/kernel/exit.c */
/*
 * With CONFIG_DEBUG_STACK_USAGE: track the smallest amount of unused kernel
 * stack seen so far and print a warning whenever the exiting task sets a
 * new low. The check is done once without the lock as a fast path and
 * repeated under low_water_lock before updating. Without the option the
 * function is an empty inline stub.
 */
#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
unsigned long free;
free = stack_not_used(current);
if (free >= lowest_to_date)
return;
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
"left\n",
current->comm, free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif
20)、触发thread_notify_head链表中所有通知链实例的处理函数,用于处理struct thread_info结构体
exit_thread();
/* linux-2.6.38.8/arch/arm/kernel/process.c */
/* Issue a THREAD_NOTIFY_EXIT thread notification for the current thread_info. */
void exit_thread(void)
{
thread_notify(THREAD_NOTIFY_EXIT, current_thread_info());
}
21)、Performance Event功能相关资源的释放
perf_event_exit_task(tsk);
/* linux-2.6.38.8/kernel/perf_event.c */
/*
 * Release the perf-event state of an exiting task: under
 * child->perf_event_mutex, unlink every event on child->perf_event_list and
 * clear its owner, then tear down each per-task perf context.
 */
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
/*
 * Ensure the list deletion is visible before we clear
 * the owner, closes a race against perf_release() where
 * we need to serialize on the owner->perf_event_mutex.
 */
smp_wmb();
event->owner = NULL;
}
mutex_unlock(&child->perf_event_mutex);
for_each_task_context_nr(ctxn)
perf_event_exit_task_context(child, ctxn);
}
22)、释放Control Groups相关的资源
cgroup_exit(tsk, 1);
/* linux-2.6.38.8/kernel/cgroup.c */
/*
 * Detach an exiting task from its cgroups.
 *
 * tsk:           the exiting task.
 * run_callbacks: when non-zero (and need_forkexit_callback is set), call
 *                the exit hook of every built-in subsystem that has one.
 *
 * The task is then unlinked from its css_set task list, re-pointed at
 * init_css_set under task_lock, and the reference on the old css_set is
 * dropped.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
int i;
struct css_set *cg;
if (run_callbacks && need_forkexit_callback) {
/*
 * modular subsystems can't use callbacks, so no need to lock
 * the subsys array
 */
for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->exit)
ss->exit(ss, tsk);
}
}
/*
 * Unlink from the css_set task list if necessary.
 * Optimistically check cg_list before taking
 * css_set_lock
 */
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
/* Reassign the task to the init_css_set. */
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
task_unlock(tsk);
if (cg)
put_css_set_taskexit(cg);
}
23)、脱离控制终端
if (group_dead)
disassociate_ctty(1);
24)、执行域
module_put(task_thread_info(tsk)->exec_domain->module);
25)、进程事件连接器(通过它来报告进程fork、exec、exit以及进程用户ID与组ID的变化)
proc_exit_connector(tsk);
/* linux-2.6.38.8/drivers/connector/cn_proc.c */
/*
 * Send a PROC_EVENT_EXIT message about task through the connector. Returns
 * early when there is no listener. The message is built in an on-stack
 * buffer and carries the pid, tgid, exit code and exit signal of task plus
 * a monotonic timestamp.
 */
void proc_exit_connector(struct task_struct *task)
{
struct cn_msg *msg;
struct proc_event *ev;
__u8 buffer[CN_PROC_MSG_SIZE];
struct timespec ts;
if (atomic_read(&proc_event_num_listeners) < 1)
return;
msg = (struct cn_msg*)buffer;
ev = (struct proc_event*)msg->data;
get_seq(&msg->seq, &ev->cpu);
ktime_get_ts(&ts); /* get high res monotonic timestamp */
put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
ev->what = PROC_EVENT_EXIT;
ev->event_data.exit.process_pid = task->pid;
ev->event_data.exit.process_tgid = task->tgid;
ev->event_data.exit.exit_code = task->exit_code;
ev->event_data.exit.exit_signal = task->exit_signal;
memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
msg->ack = 0; /* not used */
msg->len = sizeof(*ev);
cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
}
参考文档:linux-2.6.38.8/Documentation/connector/connector.txt
http://www.ibm.com/developerworks/cn/linux/l-connector/
26)、注销断点
ptrace_put_breakpoints(tsk);
/* linux-2.6.38.8/kernel/ptrace.c */
/*
 * Drop one reference on tsk->ptrace_bp_refcnt; when it reaches zero, flush
 * the task's ptrace hardware breakpoints.
 */
void ptrace_put_breakpoints(struct task_struct *tsk)
{
if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
flush_ptrace_hw_breakpoint(tsk);
}
/* linux-2.6.38.8/arch/arm/kernel/ptrace.c */
/*
 * Unregister every hardware breakpoint stored in tsk->thread.debug.hbp[]
 * and reset the corresponding slot to NULL. Empty slots are skipped.
 */
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
int i;
struct thread_struct *t = &tsk->thread;
for (i = 0; i < ARM_MAX_HBP_SLOTS; i++) {
if (t->debug.hbp[i]) {
unregister_hw_breakpoint(t->debug.hbp[i]);
t->debug.hbp[i] = NULL;
}
}
}
27)、更新所有子进程的父进程
exit_notify(tsk, group_dead);
/* linux-2.6.38.8/kernel/exit.c */
/*
 * Announce the death of tsk: re-parent its children, release its
 * namespaces, notify the parent, and set tsk->exit_state to EXIT_DEAD or
 * EXIT_ZOMBIE. A task that ends up in DEATH_REAP is released immediately.
 *
 * tsk:        the exiting task.
 * group_dead: non-zero when the whole thread group is gone; enables the
 *             orphaned-process-group check.
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
int signal;
void *cookie;
/*
 * This does two things:
 *
 * A. Make init inherit all the child processes
 * B. Check to see if any process groups have become orphaned
 * as a result of our exiting, and if they have any stopped
 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 */
forget_original_parent(tsk); // per the article: children are re-parented to another thread of the group or to init
exit_task_namespaces(tsk); // per the article: frees current->nsproxy once its use count drops to 0
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
/* Let father know we died
 *
 * Thread signals are configurable, but you aren't going to use
 * that to send signals to arbitrary processes.
 * That stops right now.
 *
 * If the parent exec id doesn't match the exec id we saved
 * when we started then we know the parent has changed security
 * domain.
 *
 * If our self_exec id doesn't match our parent_exec_id then
 * we have changed execution domain as these two values started
 * the same after a fork.
 */
if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && // per the article, task_detached() tests whether tsk->exit_signal equals -1
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD; // fall back to SIGCHLD as the exit signal
signal = tracehook_notify_death(tsk, &cookie, group_dead); // tracing hook decides which death notification, if any, to send
if (signal >= 0)
signal = do_notify_parent(tsk, signal); // tell the parent that this task died
tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; // EXIT_DEAD only when nobody will reap the task; otherwise it stays a zombie
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
tracehook_report_death(tsk, signal, cookie, group_dead);
/* If the process is dead, release it - nobody will wait for it */
if (signal == DEATH_REAP) // i.e. exit_state was set to EXIT_DEAD above
release_task(tsk); // reclaim the task's remaining data structures now
}
28)、用于NUMA,当引用计数为0时,释放struct mempolicy结构体所占用的内存
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
task_unlock(tsk);
#endif
29)、释放struct futex_pi_state结构体所占用的内存
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
#endif
30)、释放struct io_context结构体所占用的内存
if (tsk->io_context)
exit_io_context(tsk);
/* linux-2.6.38.8/block/blk-ioc.c */
/*
 * Detach task from its io_context: clear task->io_context under task_lock,
 * call cfq_exit() if this was the last task using the context, then drop
 * the reference with put_io_context(). The code dereferences ioc without a
 * NULL check, so it relies on task->io_context being non-NULL on entry.
 */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
if (atomic_dec_and_test(&ioc->nr_tasks))
cfq_exit(ioc);
put_io_context(ioc);
}
31)、释放与进程描述符splice_pipe字段相关的资源
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
/* linux-2.6.38.8/fs/pipe.c */
/*
 * Free a pipe_inode_info: call the release operation of every buffer that
 * has ops set, free the temporary page if one is cached, then free the
 * buffer array and the pipe structure itself.
 */
void __free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
buf->ops->release(pipe, buf);
}
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
kfree(pipe->bufs);
kfree(pipe);
}
32)、调度其它进程
tsk->state = TASK_DEAD; //调度程序忽略处于TASK_DEAD状态的进程
schedule();
在调用do_exit函数之后,尽管进程已经不能再被调度,但系统还是保留了它的进程描述符,这样做是为了让系统有办法在进程终止后仍能获得它的信息。在父进程获得已终止子进程的信息后,子进程的task_struct结构体才被释放(包括此进程的内核栈)。