一、进程重要字段描述
在文件include\linux\sched.h中定义了进程描述符task_struct,关注如下字段:
进程状态
volatile long state:表示进程状态,在该文件头部有几种状态的取值。
long exit_state:表示进程退出状态,下面的定义中前缀为EXIT的表示该字段取值,表示进程退出状态
/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */
#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define TASK_STOPPED		4
#define TASK_TRACED		8
/* in tsk->exit_state */
#define EXIT_ZOMBIE		16
#define EXIT_DEAD		32
/* in tsk->state again */
#define TASK_NONINTERACTIVE	64

/*
 * __set_task_state() is a plain store to tsk->state;
 * set_task_state() performs the store through set_mb().
 */
#define __set_task_state(tsk, state_value)		\
	do { (tsk)->state = (state_value); } while (0)
#define set_task_state(tsk, state_value)		\
	set_mb((tsk)->state, (state_value))

/*
 * set_current_state() includes a barrier so that the write of current->state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *	set_current_state(TASK_UNINTERRUPTIBLE);
 *	if (do_i_need_to_sleep())
 *		schedule();
 *
 * If the caller does not need such serialisation then use __set_current_state()
 */
#define __set_current_state(state_value)			\
	do { current->state = (state_value); } while (0)
#define set_current_state(state_value)		\
	set_mb(current->state, (state_value))
新加入了一个状态TASK_NONINTERACTIVE?还不明白是干什么的,暂时先记着
标志一个进程
pid_t pid;
pid_t tgid;
用process id,即pid来标志一个进程。POSIX规定一个线程组中所有线程都必须有同一个pid(线程在linux中也被叫做轻量级进程),在linux中所有线程使用该线程组的领头线程(thread group leader)的pid,存放在tgid之中。getpid系统调用返回的是当前进程的tgid而不是pid
线程描述符
struct thread_info *thread_info;
该结构和进程的内核栈存放在两个连续的页框之内,thread_info位于这块区域的低地址端(栈从高地址向低地址、朝thread_info的方向增长),可以根据esp指针找到thread_info(如果栈是8K,那么只需要屏蔽掉esp的低13位),同时根据thread_info中的task指针可以快速找到进程的进程描述符。
thread_info定义在include\asm-i386\thread_info.h之中,该文件还定义了一些对于thread_info的操作函数。
/*
 * Low-level per-thread bookkeeping; its first member points back at the
 * owning task_struct.
 */
struct thread_info {
	struct task_struct	*task;		/* main task structure */
	struct exec_domain	*exec_domain;	/* execution domain */
	unsigned long		flags;		/* low level flags */
	unsigned long		status;		/* thread-synchronous flags */
	__u32			cpu;		/* current CPU */
	int			preempt_count;	/* 0 => preemptable, <0 => BUG */

	mm_segment_t		addr_limit;	/* thread address space:
						   0-0xBFFFFFFF for user-thread
						   0-0xFFFFFFFF for kernel-thread
						*/
	void			*sysenter_return;
	struct restart_block	restart_block;

	unsigned long		previous_esp;	/* ESP of the previous stack in case
						   of nested (IRQ) stacks
						*/
	/* zero-length trailing array: marks where the stack area begins */
	__u8			supervisor_stack[0];
};
在include\linux\sched.h还定义了thread_union,用来方便的描述线程描述符和内核栈
/*
 * Overlays a thread_info on a THREAD_SIZE-byte array of longs, so both
 * views start at the same address.
 */
union thread_union {
	struct thread_info thread_info;
	unsigned long stack[THREAD_SIZE/sizeof(long)];	/* THREAD_SIZE bytes in total */
};
进程链表
struct list_head tasks; 连接所有的进程描述符
在include\linux\list.h之中定义了linux的两种链表list_head和hlist_head、hlist_node和相关操作。在此不赘述,看task_struct中部分链表
在include\linux\sched.h还定义了用来遍历整个进程描述符的宏for_each_process,初始进程init_task
运行进程的链表
struct list_head run_list;
这个字段在调度的时候使用,当寻找下一个可运行的进程的时候就使用这个字段
进程间的关系
父子关系涉及到如下4个字段:
struct task_struct *real_parent; 父进程
struct task_struct *parent; 在发出ptrace调用时和real_parent不一致
struct list_head children; 子进程链表的头
struct list_head sibling; 子进程链表的下一个子进程
而进程间有非亲属关系,涉及的字段如下:
struct task_struct *group_leader; 指向线程组领头线程的进程描述符
struct list_head thread_group; 线程组的链表
pid_t signal->pgrp; 所在进程组领头进程的PID
pid_t signal->session; 会话领头进程的pid
struct list_head ptrace_children; 跟踪子进程的链表头
struct list_head ptrace_list; 跟踪子进程的链表节点
进程的等待队列
在include\linux\wait.h中定义了两个结构__wait_queue_head和__wait_queue,分别表示等待队列的头和等待队列的节点
/* Head of a wait queue: a spinlock plus the list that entries are linked on. */
struct __wait_queue_head {
	spinlock_t lock;
	struct list_head task_list;
};
/* One entry on a wait queue. */
struct __wait_queue {
	unsigned int flags;
#define WQ_FLAG_EXCLUSIVE	0x01	/* bit value for the flags field */
	void *private;			/* opaque pointer; presumably the waiting task - TODO confirm */
	wait_queue_func_t func;		/* callback of type wait_queue_func_t */
	struct list_head task_list;	/* list linkage */
};
__wait_queue_head中包含一个自旋锁和一个链表头(list_head)
__wait_queue中第一个字段flags表示互斥(进程等待互斥的访问同一资源)还是非互斥进程(比如等待磁盘传输结束的所有进程),第二个字段private为void指针,指向的应该是在该节点上等待的进程的进程描述符(书上这一部分写的是task_struct指针,不清楚到这一版本怎么改成了这个),func字段表示的是如何唤醒进程(类型定义也在该文件之中),task_list字段把等待相同事件的进程串联起来
该文件中还定义了很多等待队列的操作,具体看文件,不再赘述
进程资源限制
在进程描述符中有一个信号描述符
struct signal_struct *signal;
该结构同样定义在本文件中,在signal_struct中有一个如下字段,在信号描述符中似乎还定义了很多和信号无关的数据,这里的资源限制,上面的一些pid值,我还不是很清楚为什么要这么做,是因为锁的缘故吗?信号描述符好像是没有锁的,而锁在sighand_struct之中?
struct rlimit rlim[RLIM_NLIMITS];
struct rlimit定义在include\linux\resource.h之中,两个字段分别表示资源的当前限制值(软限制)和允许的最大限制值(硬限制)
/* A pair of limits for one resource. */
struct rlimit {
	unsigned long	rlim_cur;	/* current limit (presumably the soft limit - confirm) */
	unsigned long	rlim_max;	/* maximum limit (presumably the hard limit - confirm) */
};
在include\asm-generic\resource.h之中的定义了很多的以RLIMIT开头宏,用来访问各种资源的限制
进程所属用户信息
在include\linux\sched.h之中,定义了一个user_struct的结构。一个用户不止拥有一个进程,多个进程间可以通过user_struct结构来共享用户的信息
/*
 * Per-user accounting record: counters for processes, files, pending
 * signals, etc., plus the uid and hash-list linkage.
 */
struct user_struct {
	atomic_t __count;	/* reference count */
	atomic_t processes;	/* How many processes does this user have? */
	atomic_t files;		/* How many open files does this user have? */
	atomic_t sigpending;	/* How many pending signals does this user have? */
#ifdef CONFIG_INOTIFY_USER
	atomic_t inotify_watches; /* How many inotify watches does this user have? */
	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
#endif
	/* protected by mq_lock	*/
	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
	unsigned long locked_shm; /* How many pages of mlocked shm ? */

#ifdef CONFIG_KEYS
	struct key *uid_keyring;	/* UID specific keyring */
	struct key *session_keyring;	/* UID's default session keyring */
#endif

	/* Hash table maintenance information */
	struct list_head uidhash_list;
	uid_t uid;
};
进程切换时硬件上下文的保存
不是如intel所设计的那样,为每个进程设置了tss字段,linux为一个cpu保留一个tss段,当发生异常和中断的时候,会根据tss中字段做一些操作,操作系统将硬件上下文保存在进程描述符的一个字段中,当进程被切换上cpu时,使用下面字段的部分值去更改tss段。
struct thread_struct thread;
thread_struct定义在include\asm-i386\processor.h之中
/*
 * Per-task CPU state: TLS descriptors, saved stack/instruction pointers
 * and segment values, debug registers, fault info, FPU state, vm86 state
 * and I/O permission bitmap.
 */
struct thread_struct {
/* cached TLS descriptors. */
	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
	unsigned long	esp0;
	unsigned long	sysenter_cs;
	unsigned long	eip;
	unsigned long	esp;
	unsigned long	fs;
	unsigned long	gs;
/* Hardware debugging registers */
	unsigned long	debugreg[8];  /* %%db0-7 debug registers */
/* fault info */
	unsigned long	cr2, trap_no, error_code;
/* floating point info */
	union i387_union	i387;
/* virtual 86 mode info */
	struct vm86_struct __user * vm86_info;
	unsigned long		screen_bitmap;
	unsigned long		v86flags, v86mask, saved_esp0;
	unsigned int		saved_fs, saved_gs;
/* IO permissions */
	unsigned long	*io_bitmap_ptr;
	unsigned long	iopl;
/* max allowed port in the bitmap, in bytes: */
	unsigned long	io_bitmap_max;
};
二、重要流程描述
2.1 fork、vfork、clone系统调用,创建一个子进程
fork、vfork、clone调用的系统服务例程在arch\i386\kernel\process.c之中,该文件还包括进程的其他系统调用的服务例程如exec等。
/*
 * fork() service routine: delegates to do_fork() with SIGCHLD as the only
 * flag, the caller's saved esp as the child's stack pointer, and no
 * tid pointers.
 *
 * Returns whatever do_fork() returns.
 */
asmlinkage int sys_fork(struct pt_regs regs)
{
	/* pass the address of the by-value register frame */
	return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
/*
 * clone() service routine: unpacks the arguments from the saved registers
 * (ebx = flags, ecx = new stack pointer, edx = parent tid pointer,
 * edi = child tid pointer) and hands them to do_fork().
 *
 * Returns whatever do_fork() returns.
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;
	int __user *parent_tidptr, *child_tidptr;

	clone_flags = regs.ebx;
	newsp = regs.ecx;
	parent_tidptr = (int __user *)regs.edx;
	child_tidptr = (int __user *)regs.edi;
	/* no stack supplied: fall back to the caller's saved esp */
	if (!newsp)
		newsp = regs.esp;
	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}
/*
 * vfork() service routine: same as sys_fork() but additionally passes
 * CLONE_VFORK and CLONE_VM to do_fork().
 *
 * Returns whatever do_fork() returns.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
	/* pass the address of the by-value register frame */
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
其调用的do_fork在kernel\fork.c之中。
/*
 * Common back end of fork/vfork/clone.
 *
 * Allocates a pid, builds the child with copy_process(), then either wakes
 * the child or leaves it in TASK_STOPPED, handles ptrace notification, and
 * for CLONE_VFORK blocks on a completion before returning.
 *
 * Returns the new pid number on success, -EAGAIN if no pid could be
 * allocated, or the error encoded in copy_process()'s return pointer.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      struct pt_regs *regs,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	struct pid *pid = alloc_pid();
	long nr;

	if (!pid)
		return -EAGAIN;
	nr = pid->nr;
	/* caller is being ptraced: fork_traceflag() decides whether the child is traced too */
	if (unlikely(current->ptrace)) {
		trace = fork_traceflag (clone_flags);
		if (trace)
			clone_flags |= CLONE_PTRACE;
	}

	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		/* lives on this stack frame; only waited on in the CLONE_VFORK case below */
		struct completion vfork;

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
		}

		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
			/*
			 * We'll start up with an immediate SIGSTOP.
			 */
			sigaddset(&p->pending.signal, SIGSTOP);
			set_tsk_thread_flag(p, TIF_SIGPENDING);
		}

		if (!(clone_flags & CLONE_STOPPED))
			wake_up_new_task(p, clone_flags);
		else
			p->state = TASK_STOPPED;

		/* report the new pid number through ptrace_message */
		if (unlikely (trace)) {
			current->ptrace_message = nr;
			ptrace_notify ((trace << 8) | SIGTRAP);
		}

		if (clone_flags & CLONE_VFORK) {
			/* block until the completion stored in p->vfork_done is signalled */
			wait_for_completion(&vfork);
			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
				current->ptrace_message = nr;
				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
			}
		}
	} else {
		/* copy_process() failed: release the pid and return its error code */
		free_pid(pid);
		nr = PTR_ERR(p);
	}
	return nr;
}
do_fork有6个参数,含义如下:
1、clone_flags:各种信息,分为两部分,最低的字节为信号类型,用来规定子进程去世时应该向父进程发出的信号,高位是在include\linux\sched.h之中定义了很多以CLONE_开头的宏,代表不同的含义(具体见文件内,这里不再描述)。在sys_fork调用do_fork时使用了信号SIGCHLD,该信号是子进程停止、结束、或是在被跟踪时获得。
2、stack_start:将父进程用户态栈的地址传入
3、regs:寄存器值
4、stack_size:栈大小
5、parent_tidptr:创建子进程后,将子进程的pid写到该地址,该指针指向父进程的一块地址
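6、child_tidptr:同样由clone调用传入的用户空间指针;从copy_process的代码看,设置了CLONE_CHILD_SETTID时它被记入子进程的set_child_tid,设置了CLONE_CHILD_CLEARTID时被记入clear_child_tid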
接着分析do_fork所做的工作
a、调用了alloc_pid函数分配了一个pid。(接下来需要具体分析各个pid的情况,linux内核设计与实现进程篇里有一点信息,见kernel\pid.c文件)
b、查看进程的ptrace字段,如果置位表示有进程想跟踪子进程,则置位clone_flags的CLONE_PTRACE位。(查看fork_traceflag函数可知跟踪子进程对vfork、fork、clone都做了区分,还涉及到SIGCHLD的判断)
c、调用了copy_process函数复制进程,返回进程的进程描述符。copy_process是一个比较复杂的函数,下面单独看看。
d、接着使用IS_ERR宏判断copy_process返回的指针是否正确(是否位于最后一个页),该宏定义在include\linux\err.h中,包括下面用到的PTR_ERR宏(将指针转成错误号),用来处理指针错误,可直接搜索这个字段。
e、如果必须要跟踪子进程,即设置了PT_PTRACED,或者设置了CLONE_STOPPED,则给子进程增加一个SIGSTOP信号,并设置信号标志位
f、如果没有设置CLONE_STOPPED,则调用wake_up_new_task直接唤醒新的进程,函数实现在kernel\sched.c之中,等到调度时再看吧。
g、如果设置了CLONE_STOPPED,则进程状态设置为TASK_STOPPED
h、如果设置了trace,则把子进程的pid放入父进程的ptrace_message,并调用ptrace_notify,这是信号部分的函数,在\kernel\signal.c中,它使当前进程停止运行,向当前进程的父进程,也就是debugger进程发送SIGTRAP信号,并且可以在ptrace_message中找到子进程的pid。
i、如果设置了CLONE_VFORK,则让父进程挂起,直到子进程结束。wait_for_completion在看调度的时候再具体看下。父进程执行到这里应该就停下来了,直到子进程执行完,下面的代码才会被执行,下面的内容还是执行ptrace_notify函数。
2.2 再看fork、vfork、clone系统调用的区别
clone系统调用是功能最为强大的一个,他调用的do_fork函数所有参数都来自于调用者传入,可以实现任何程度上的进程复制。
fork系统调用无参数,它的clone_flags中除信号之外的标志位都为空,只规定了子进程退出时向父进程发送SIGCHLD信号。同时还有写时复制的原因在里面。
vfork系统调用则在fork系统调用的基础上加了CLONE_VFORK和CLONE_VM位,CLONE_VM位表示共享内存描述符和所有页表,即共享了所有的地址空间,同时CLONE_VFORK表示子进程运行时让父进程挂起,直到子进程结束。在网上看到vfork系统调用出来的子进程不能使用return,也不能使用exit,但能使用_exit,不知道为什么,还需要思考一下。是因为子进程使用return和exit之后会释放掉父进程的栈空间,导致父进程不能继续执行了吗?也就是说子进程和父进程的执行步骤还是不一样的,到底有哪些事件一样,哪些事件不一样还需要好好看一看exit和_exit的区别。
2.3 copy_process函数复制进程
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * Returns the new task_struct on success, or an ERR_PTR()-encoded
 * negative errno on failure. On failure everything acquired so far is
 * released through the bad_fork_* label chain at the bottom, in reverse
 * order of acquisition.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					struct pt_regs *regs,
					unsigned long stack_size,
					int __user *parent_tidptr,
					int __user *child_tidptr,
					int pid)
{
	int retval;
	struct task_struct *p = NULL;

	/* reject flag combinations that are not allowed together */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

#ifdef CONFIG_TRACE_IRQFLAGS
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	/* per-user process limit; bypassed with either capability or for root_user */
	if (atomic_read(&p->user->processes) >=
			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
				p->user != &root_user)
			goto bad_fork_free;
	}

	atomic_inc(&p->user->__count);
	atomic_inc(&p->user->processes);
	get_group_info(p->group_info);

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;

	if (p->binfmt && !try_module_get(p->binfmt->module))
		goto bad_fork_cleanup_put_domain;

	p->did_exec = 0;
	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	copy_flags(clone_flags, p);
	p->pid = pid;
	retval = -EFAULT;
	if (clone_flags & CLONE_PARENT_SETTID)
		if (put_user(p->pid, parent_tidptr))
			goto bad_fork_cleanup_delays_binfmt;

	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	clear_tsk_thread_flag(p, TIF_SIGPENDING);
	init_sigpending(&p->pending);

	p->utime = cputime_zero;
	p->stime = cputime_zero;
	p->sched_time = 0;
	p->rchar = 0;		/* I/O counter: bytes read */
	p->wchar = 0;		/* I/O counter: bytes written */
	p->syscr = 0;		/* I/O counter: read syscalls */
	p->syscw = 0;		/* I/O counter: write syscalls */
	acct_clear_integrals(p);

	p->it_virt_expires = cputime_zero;
	p->it_prof_expires = cputime_zero;
	p->it_sched_expires = 0;
	INIT_LIST_HEAD(&p->cpu_timers[0]);
	INIT_LIST_HEAD(&p->cpu_timers[1]);
	INIT_LIST_HEAD(&p->cpu_timers[2]);

	p->lock_depth = -1;		/* -1 = no lock */
	do_posix_clock_monotonic_gettime(&p->start_time);
	p->security = NULL;
	p->io_context = NULL;
	p->io_wait = NULL;
	p->audit_context = NULL;
	cpuset_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_copy(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_cpuset;
	}
	mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
	p->hardirqs_enabled = 0;
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

	rt_mutex_init_task(p);

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif

	/* a new thread (CLONE_THREAD) takes the caller's tgid, otherwise tgid == pid */
	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	if ((retval = security_task_alloc(p)))
		goto bad_fork_cleanup_policy;
	if ((retval = audit_alloc(p)))
		goto bad_fork_cleanup_security;
	/* copy all the process information */
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_audit;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_keys(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	if ((retval = copy_namespace(clone_flags, p)))
		goto bad_fork_cleanup_keys;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_namespace;

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;

	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing should be turned off in the child regardless
	 * of CLONE_PTRACE.
	 */
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif

	/* Our parent execution domain becomes current domain
	   These must match for thread signalling to apply */

	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
	p->pdeath_signal = 0;
	p->exit_state = 0;

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->thread_group);
	INIT_LIST_HEAD(&p->ptrace_children);
	INIT_LIST_HEAD(&p->ptrace_list);

	/* Perform scheduler related setup. Assign this task to a CPU. */
	sched_fork(p, clone_flags);

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/*
	 * The task hasn't been attached yet, so its cpus_allowed mask will
	 * not be changed, nor will its assigned CPU.
	 *
	 * The cpus_allowed mask of the parent may have changed after it was
	 * copied first time - so re-copy it here, then check the child's CPU
	 * to ensure it is on a valid CPU (and if not, just force it back to
	 * parent's CPU). This avoids alot of nasty races.
	 */
	p->cpus_allowed = current->cpus_allowed;
	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
			!cpu_online(task_cpu(p))))
		set_task_cpu(p, smp_processor_id());

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
		p->real_parent = current->real_parent;
	else
		p->real_parent = current;
	p->parent = p->real_parent;

	spin_lock(&current->sighand->siglock);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	 */
	recalc_sigpending();
	if (signal_pending(current)) {
		/* drop both locks (taken above) before unwinding */
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_cleanup_namespace;
	}

	if (clone_flags & CLONE_THREAD) {
		p->group_leader = current->group_leader;
		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);

		if (!cputime_eq(current->signal->it_virt_expires,
				cputime_zero) ||
		    !cputime_eq(current->signal->it_prof_expires,
				cputime_zero) ||
		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
		    !list_empty(&current->signal->cpu_timers[0]) ||
		    !list_empty(&current->signal->cpu_timers[1]) ||
		    !list_empty(&current->signal->cpu_timers[2])) {
			/*
			 * Have child wake up on its first tick to check
			 * for process CPU timers.
			 */
			p->it_prof_expires = jiffies_to_cputime(1);
		}
	}

	/*
	 * inherit ioprio
	 */
	p->ioprio = current->ioprio;

	if (likely(p->pid)) {
		add_parent(p);
		if (unlikely(p->ptrace & PT_PTRACED))
			__ptrace_link(p, current->parent);

		if (thread_group_leader(p)) {
			p->signal->tty = current->signal->tty;
			p->signal->pgrp = process_group(current);
			p->signal->session = current->signal->session;
			attach_pid(p, PIDTYPE_PGID, process_group(p));
			attach_pid(p, PIDTYPE_SID, p->signal->session);

			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			__get_cpu_var(process_counts)++;
		}
		attach_pid(p, PIDTYPE_PID, p->pid);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	proc_fork_connector(p);
	return p;

	/* error unwinding: each label undoes one earlier step and falls through */
bad_fork_cleanup_namespace:
	exit_namespace(p);
bad_fork_cleanup_keys:
	exit_keys(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	cleanup_signal(p);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_free(p->mempolicy);
bad_fork_cleanup_cpuset:
#endif
	cpuset_exit(p);
bad_fork_cleanup_delays_binfmt:
	delayacct_tsk_free(p);
	if (p->binfmt)
		module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
	put_group_info(p->group_info);
	atomic_dec(&p->user->processes);
	free_uid(p->user);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}
函数执行步骤如下:
1、首先判断clone_flags的各种位
- CLONE_NEWNS和CLONE_FS标志不能同时被设置,看文件系统时再来看这部分
- CLONE_THREAD表示把子进程插入到父进程同一线程组中,CLONE_SIGHAND共享信号处理的表、阻塞信号的表、挂起信号的表。
- 设置了CLONE_SIGHAND位必须设置CLONE_VM位,共享内存描述符和所有页表
2、security_task_create是钩子函数,后面再看
3、dup_task_struct函数为子进程获取进程描述符
/*
 * Allocates a task_struct and a thread_info for a new task and fills both
 * with a copy of orig's.
 *
 * Returns the new task_struct, or NULL if either allocation fails (the
 * task_struct is freed again if the thread_info allocation fails).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
	struct task_struct *tsk;
	struct thread_info *ti;

	prepare_to_copy(orig);

	tsk = alloc_task_struct();
	if (!tsk)
		return NULL;

	ti = alloc_thread_info(tsk);
	if (!ti) {
		free_task_struct(tsk);
		return NULL;
	}

	/* whole-struct copy; thread_info is re-pointed at the new allocation right after */
	*tsk = *orig;
	tsk->thread_info = ti;
	setup_thread_stack(tsk, orig);

	/* One for us, one for whoever does the "release_task()" (usually parent) */
	atomic_set(&tsk->usage,2);
	atomic_set(&tsk->fs_excl, 0);
	tsk->btrace_seq = 0;
	tsk->splice_pipe = NULL;
	return tsk;
}
dup_task_struct函数执行如下步骤:
a、prepare_to_copy定义在arch\i386\kernel\process.c之中,直接调用unlazy_fpu,在include\asm-i386\i387.h中,所做的工作是将FPU、MMX、SSE\SSE2寄存器的内容保存到父进程之中。稍后会将他们复制到子进程之中。是保存到thread_info的哪个字段啊?最后__unlazy_fpu函数的实现也比较复杂,thread_info的字段好像也没有相关的字段。(从前面thread_struct的定义看,更可能是保存在进程描述符thread字段的i387成员中,而不是thread_info里,待确认。)
b、调用了alloc_task_struct分配了一个新的进程描述符(此时还没有复制内容),在fork.c之中
/* Allocates one object from the task_struct_cachep slab cache with GFP_KERNEL. */
# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
c、接着是include\asm-i386\thread_info.h中的alloc_thread_info,分配一块THREAD_SIZE大小的空间,用来存放新的thread_info(以及内核栈)。
/*
 * Allocates THREAD_SIZE bytes with kmalloc(GFP_KERNEL) and zero-fills them.
 * Evaluates to the new struct thread_info pointer, or NULL on allocation
 * failure. The tsk argument is not used.
 *
 * NOTE(review): the excerpt originally ended with a dangling "#else", which
 * suggests this is one arm of a conditional whose other arm was not copied;
 * confirm against the header.
 */
#define alloc_thread_info(tsk)					\
	({							\
		struct thread_info *ret;			\
								\
		ret = kmalloc(THREAD_SIZE, GFP_KERNEL);		\
		if (ret)					\
			memset(ret, 0, THREAD_SIZE);		\
		ret;						\
	})
c、接着是复制了整个进程描述符的内容。
d、调用setup_thread_stack函数,在include\linux\sched.h之中,注意这个函数的参数,第一个是新建的进程描述符指针tsk,但是此时还是指向了原进程描述符的内容,只是tsk的thread_info结构重新建了一个。第二个参数是当前调用进程的进程描述符指针。
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; }
可以看到其实这里的thread_info结构其实也只是做了简单的数据内容复制。然后将thread_info中的task指针指向了新建的task_struct结构。
e、usage表示该进程描述符的使用次数,设为2,而且进程处于活动状态
f、fs_excl、btrace_seq、splice_pipe、这些个字段也不知道干嘛用的、文件系统时再看看吧
进程描述符的创建就结束了,下面回到copy_process函数中去。
4、使用p->user->processes和p->signal->rlim[RLIMIT_NPROC].rlim_cur进行比较。signal->rlim数组已经描述过,表示资源限制,RLIMIT_NPROC表示最大进程数。user是定义在进程描述符同一文件中的user_struct结构,表示一个用户的多进程间共享的用户信息。这里比较用户当前拥有进程数是否大于最大进程数。至于这个capability又是另外一块了,下次再看。
5、递增引用数和拥有进程数
6、get_group_info就是递增了一下group_info的引用,应该是进程组共享的信息,在include\linux\sched.h之中
7、拿nr_threads和max_threads来比较,看系统进程数目是否大于最大值,原则是所有进程的内核栈空间不能超过物理内存的1/8。
8、exec_domain是执行域,和模块等相关概念可以深入理解一下。
9、did_exec记录的是发出exec的次数,置为0。然后调用copy_flags更改了一些标志位。
/*
 * Derive the child's p->flags from the inherited value: PF_SUPERPRIV and
 * PF_NOFREEZE are cleared, PF_FORKNOEXEC is set. Unless CLONE_PTRACE was
 * requested, the child's ptrace field is reset to 0.
 */
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long child_flags;

	child_flags = (p->flags & ~(PF_SUPERPRIV | PF_NOFREEZE)) | PF_FORKNOEXEC;

	if ((clone_flags & CLONE_PTRACE) == 0)
		p->ptrace = 0;

	p->flags = child_flags;
}
相关标志位在 include\linux\sched.h中,PF_SUPERPRIV表示使用了超级用户权限,PF_NOFREEZE表示进程不能被冻结,PF_FORKNOEXEC表示没有发出过exec系统调用。CLONE_PTRACE表示跟踪新建的子进程,并且进程描述符中的ptrace字段表示进程被跟踪。
10、设置pid,设置了CLONE_PARENT_SETTID位的话,将子进程的pid写入到指针处(该地址是clone传入的参数,由do_fork传给copy_process,是父进程用户空间的一块地址)
11、vfork_done是在vfork的时候使用的一个指针,在前面的do_fork有涉及到,是有关调度的。
12、初始化进程描述符中的children和sibling指针,使用INIT_LIST_HEAD函数
/* Make a list head point at itself in both directions. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->prev = list->next = list;
}
13、初始化alloc_lock自旋锁、取消信号标志位、将私有信号队列初始化、初始化各种时间变量
14、这部分很多个变量用途还不是很清楚,以后再看
15、进程的线程组号设为进程pid号,这说明新建的是进程而不是线程,如果是新建线程,则设置CLONE_THREAD位,tgid的值就设为当前进程(父进程)的tgid。设置了CLONE_THREAD位,就应该设置信号共享位,同时应该设置CLONE_VM位,共享内存描述符和所有页表。
16、接着是调用了很多的copy函数,就是创建一下新的结构。
17、看copy_thread函数,用clone传入的寄存器值和clone_flags去初始化子进程的内核栈。
/*
 * Sets up the child's saved register frame and thread_struct.
 *
 * The child's pt_regs is copied from regs with eax forced to 0 and esp set
 * to the supplied value; thread.eip is set to ret_from_fork. If the current
 * task has TIF_IO_BITMAP set, its I/O bitmap is duplicated. With
 * CLONE_SETTLS, a TLS descriptor is read from user memory (pointer taken
 * from the child's esi) and installed in thread.tls_array.
 *
 * The nr and unused parameters are not referenced in this body.
 *
 * Returns 0 on success, -ENOMEM if the bitmap allocation fails, -EFAULT if
 * the user copy fails, -EINVAL for an empty or out-of-range descriptor.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;
	struct task_struct *tsk;
	int err;

	childregs = task_pt_regs(p);
	*childregs = *regs;
	childregs->eax = 0;
	childregs->esp = esp;

	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);

	p->thread.eip = (unsigned long) ret_from_fork;

	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);

	tsk = current;
	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
		struct desc_struct *desc;
		struct user_desc info;
		int idx;

		err = -EFAULT;
		if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
			goto out;
		err = -EINVAL;
		if (LDT_empty(&info))
			goto out;

		idx = info.entry_number;
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			goto out;

		/* tls_array is indexed relative to GDT_ENTRY_TLS_MIN */
		desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}

	err = 0;
 out:
	/* on failure, release the I/O bitmap pointer if one is set */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
a、调用了task_pt_regs宏,该宏返回了内核栈中保存的pt_regs地址。该宏定义在include\asm-i386\processor.h之中。
/*
 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
 * This is necessary to guarantee that the entire "struct pt_regs"
 * is accessable even if the CPU haven't stored the SS/ESP registers
 * on the stack (interrupt gate does not save these registers
 * when switching to the same priv ring).
 * Therefore beware: accessing the xss/esp fields of the
 * "struct pt_regs" is possible, but they may contain the
 * completely wrong values.
 */
/* Evaluates to a pointer to the struct pt_regs that ends 8 bytes below KSTK_TOP(). */
#define task_pt_regs(task)                                             \
({                                                                     \
       struct pt_regs *__regs__;                                       \
       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
       __regs__ - 1;                                                   \
})

/* Number of unsigned longs that fit in THREAD_SIZE bytes. */
#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
/* Address THREAD_SIZE bytes above info, as an unsigned long. */
#define KSTK_TOP(info)                                                 \
({                                                                     \
       unsigned long *__ptr = (unsigned long *)(info);                 \
       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
})
其中KSTK_TOP就是把指针调整到栈底,这里用了数组的方式应该是为了来适配不同的long所占的不同字节数,这里THREAD_SIZE一般是两个页,有几个函数和宏定义在include\linux\sched.h,不多赘述了。
这个地方需要特别注意的地方是减8这个操作,上面的代码中有一段注释,说是在中断门的时候系统不会自动保存ss和esp寄存器,为了能完整的访问整个结构体,所以预留下了8个字节的空位。看include\asm-i386\ptrace.h中关于pt_regs的定义,发现ss和esp正好是最后两个,即靠近栈底的一边。
/*
 * Saved register frame. esp and xss are the last two members, i.e. the
 * ones that sit at the highest addresses of the frame.
 */
struct pt_regs {
	long ebx;
	long ecx;
	long edx;
	long esi;
	long edi;
	long ebp;
	long eax;
	int  xds;
	int  xes;
	long orig_eax;
	long eip;
	int  xcs;
	long eflags;
	long esp;
	int  xss;
};
b、复制整个寄存器的内容,但是eax的值置为0,这是为了让子进程返回的时候不返回pid的值,fork使用的时候就有这个规则。
c、接着是更改了子进程的esp等一系列寄存器的保存值,这里在切换的时候仔细再看。其中子进程的eip设置成了ret_from_fork函数的地址,该汇编函数定义在arch\i386\kernel\entry.S之中。
d、下面的savesegment定义在include\asm-i386\system.h中
/*
 * Store the current contents of segment register `seg` into `value`.
 * `seg` is pasted into the instruction text via stringification, so it
 * must be a bare register name such as fs or gs.
 */
#define savesegment(seg, value) \
	asm volatile("mov %%" #seg ",%0":"=rm" (value))
也就是保存段寄存器到thread对应位置中去。
e、关于io位图的后面再看
f、下面一段是创建TLS。后面再看吧。
copy_thread完了,回到copy_process函数
18、CLONE_CHILD_SETTID和前面的CLONE_PARENT_SETTID类似,如果设置了还要在子进程的用户空间的某个地址写入pid,这个地址也是和parent一致,由clone调用传入。但是这里的clear暂时还不清楚,这里也没有及时写入该值,为什么?
19、这里还有几个变量不明白什么意思
20、设置信号处理栈的地方,在vfork调用的时候是同时设置了CLONE_VFORK和CLONE_VM,这里要求只使用了CLONE_VM,还不明白有哪些情况。以及为什么。
21、TIF_SYSCALL_TRACE这个位干啥的现在也还不清楚,跟踪系统调用。
22、下面是初始化调度的部分。
23、recalc_sigpending用来查看当前是否有信号,如果有,则此时还不能将新创建的进程加入到进程组,fork发出之前的信号不应该被传送给新的进程。
24、后面实在看不下去了,字段都完全不明白,先留着,后面再做打算。
2.4 exit和exit_group系统调用退出进程
先看exit的实现,sys_exit定义在kernel\exit.c中
/*
 * exit() service routine: keeps the low 8 bits of error_code, moves them
 * into bits 8-15 and passes the result to do_exit().
 */
asmlinkage long sys_exit(int error_code)
{
	int exit_code = (error_code & 0xff) << 8;

	do_exit(exit_code);
}
调用了do_exit函数来执行,实际上所有进程终止都是使用do_exit函数,包括exit_group的实现。
/*
 * Terminates the current task with exit code `code`.
 *
 * Panics if called from interrupt context, for the task with pid 0, or for
 * child_reaper. Otherwise it marks the task PF_EXITING, releases the
 * task's resources (mm, semaphores, files, fs, namespace, thread state,
 * cpuset, keys, ...), records the exit code, calls exit_notify(), sets
 * PF_DEAD and calls schedule(). Control is not expected to come back:
 * the code after schedule() is BUG() followed by an endless loop.
 */
fastcall NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;
	struct taskstats *tidstats;
	int group_dead;
	unsigned int mycpu;

	profile_task_exit(tsk);

	WARN_ON(atomic_read(&tsk->fs_excl));

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");
	if (unlikely(tsk == child_reaper))
		panic("Attempted to kill init!");

	/* exit tracing requested: hand the exit code over via ptrace_message */
	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
		current->ptrace_message = code;
		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
	}

	/*
	 * We're taking recursive faults here in do_exit. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		printk(KERN_ALERT
			"Fixing recursive fault but reboot is needed!\n");
		if (tsk->io_context)
			exit_io_context();
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule();
	}

	tsk->flags |= PF_EXITING;

	if (unlikely(in_atomic()))
		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				current->comm, current->pid,
				preempt_count());

	taskstats_exit_alloc(&tidstats, &mycpu);

	acct_update_integrals(tsk);
	if (tsk->mm) {
		update_hiwater_rss(tsk->mm);
		update_hiwater_vm(tsk->mm);
	}
	/* non-zero when this decrement brings signal->live down to zero */
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk->signal);
	}
	acct_collect(code, group_dead);
	if (unlikely(tsk->robust_list))
		exit_robust_list(tsk);
#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
	if (unlikely(tsk->compat_robust_list))
		compat_exit_robust_list(tsk);
#endif
	if (unlikely(tsk->audit_context))
		audit_free(tsk);
	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
	taskstats_exit_free(tidstats);

	exit_mm(tsk);

	if (group_dead)
		acct_process();
	exit_sem(tsk);
	__exit_files(tsk);
	__exit_fs(tsk);
	exit_namespace(tsk);
	exit_thread();
	cpuset_exit(tsk);
	exit_keys(tsk);

	if (group_dead && tsk->signal->leader)
		disassociate_ctty(1);

	/* drop the exec-domain and binfmt module references */
	module_put(task_thread_info(tsk)->exec_domain->module);
	if (tsk->binfmt)
		module_put(tsk->binfmt->module);

	tsk->exit_code = code;
	proc_exit_connector(tsk);
	exit_notify(tsk);
#ifdef CONFIG_NUMA
	mpol_free(tsk->mempolicy);
	tsk->mempolicy = NULL;
#endif
	/*
	 * This must happen late, after the PID is not
	 * hashed anymore:
	 */
	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held(tsk);

	if (tsk->io_context)
		exit_io_context();

	if (tsk->splice_pipe)
		__free_pipe_info(tsk->splice_pipe);

	/* PF_DEAD causes final put_task_struct after we schedule. */
	preempt_disable();
	BUG_ON(tsk->flags & PF_DEAD);
	tsk->flags |= PF_DEAD;

	schedule();
	BUG();
	/* Avoid "noreturn function does return".  */
	for (;;) ;
}