【进程管理】进程(线程)创建

时间:2023-02-10 23:48:25

          本节主要研究进程(线程)创建的过程,下文将不区分进程和线程;


基本知识

在linux系统中,第一个进程是系统固有的,是由内核的设计者安排好的;一个新的进程一定要由一个已存在的进程复制出来,而不是创造出来的,其实linux系统并不提供直接创建进程的方法;创建了子进程以后,父进程有三种选择:第一,继续走自己的路,与子进程分道扬镳,但是如果子进程先行exit(),那么将要向父进程发一个信号;第二,选择睡眠,等子进程exit()去世以后,父进程再被唤醒继续执行,可使用wait4()等待某个特定的子进程,或使用wait3()等待任意一个子进程;第三,自己exit()(exit()是每一个可执行程序映像所必有的,因此在子进程中执行完后,不会返回);linux将进程的创建与目标程序的执行分成两步:

(1)从一个已存在的父进程像细胞分裂一样地复制出一个子进程;实际复制出来的子进程有自己的task_struct和系统空间堆栈,但是与父进程共享其他资源;例如,要是父进程打开了5个文件,那么子进程也打开了这5个文件,而且这些文件的读写位置处于相同的位置;fork()是全部复制,父进程的所有资源全部通过数据结构复制给子进程,但进程号不一样;clone()则是带有参数的选择性复制,可复制出一个线程,其他资源通过指针与父进程共享;vfork()是除了task_struct和系统空间堆栈外的资源全部通过指针共享(只复制指针),因此复制出来的是个线程,效率很高;

(2)目标程序的执行:创建一个进程往往是因为有不同的目标程序要让新的进程去执行,所以复制完成以后,子进程通常就要与父进程分道扬镳,用execve()执行以文件形式存在的可执行程序映像;

在(1)中,复制时只复制进程基本资源,如task_struct,系统空间堆栈,页面表等,不包括父进程的代码和全局变量,这些通过只读方式的共享,在需要写的时候,通过copy_on_write()为所涉及的页面建立一个新的副本;


fork,vfork,clone

(1)clone()主要是用来创建一个线程,包括用户线程和内核线程;创建用户线程时,可以给定子线程用户空间堆栈位置;它也可以用来创建进程,有选择性地复制父进程的资源;fork()则是全面的复制;vfork()是为了提高创建时的效率,减少系统开销;

(2)Linux内核中确实有一个创建内核线程的函数kernel_thread(),供内核线程调用;它是对clone()的包装(在下文列出的实现中直接调用do_fork()),并不执行execve(),而是执行内核中某一个函数;该函数会返回,因此返回后要执行一个exit()系统调用;

(3)fork,vfork,clone这三个系统调用都调用do_fork(),只不过调用的参数不一样,下面主要来讲解do_fork();

/*
 * fork() entry point: hands the request to do_fork().
 *
 * Only SIGCHLD is passed as clone_flags, and the child is given the
 * caller's current stack pointer (regs->sp). The result of do_fork()
 * is returned unchanged.
 */
int sys_fork(struct pt_regs *regs)
{
	unsigned long parent_sp = regs->sp;

	return do_fork(SIGCHLD, parent_sp, regs, 0, NULL, NULL);
}

int sys_vfork(struct pt_regs *regs){	//共享CLONE_VFORK和VM	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,		       NULL, NULL);}

//clone负责建立起轻量级进程(可以与其他进程共享地址空间,或打开文件等),newsp是指用户堆栈指针,parent_tid表示父进程的//的用户变量地址,child_tid表示新的轻量级进程的用户变量地址:longsys_clone(unsigned long clone_flags, unsigned long newsp,	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs){	if (!newsp)		newsp = regs->sp;  //有新的用户栈地址	//其中clone_flags一般有参数SIGCHLD,占用一个字节,剩余的3个字节可制定,如共享内存描述符,页表,文件目录,信号处理标,跟踪等	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);}

说明几点

(1)newsp为子进程新的栈,该栈可能在另一个地址空间;


/*
 * Create a kernel thread
 *
 * Builds a zeroed register frame on the stack, stores fn and arg in
 * si/di, points ip at kernel_thread_helper and then calls do_fork()
 * with CLONE_VM | CLONE_UNTRACED added to the caller's flags.
 *
 * fn:    function the new thread is meant to run
 *        (NOTE(review): presumably invoked by kernel_thread_helper
 *        with arg - the helper is not shown here).
 * arg:   argument stored alongside fn.
 * flags: extra clone flags supplied by the caller.
 *
 * Returns whatever do_fork() returns.
 */
int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct pt_regs regs;

	memset(&regs, 0, sizeof(regs));

	regs.si = (unsigned long) fn;
	regs.di = (unsigned long) arg;

#ifdef CONFIG_X86_32
	regs.ds = __USER_DS;
	regs.es = __USER_DS;
	regs.fs = __KERNEL_PERCPU;
	regs.gs = __KERNEL_STACK_CANARY;
#else
	regs.ss = __KERNEL_DS;
#endif

	regs.orig_ax = -1;
	regs.ip = (unsigned long) kernel_thread_helper;
	regs.cs = __KERNEL_CS | get_kernel_rpl();
	regs.flags = X86_EFLAGS_IF | 0x2;

	/* Ok, create the new process.. */
	/*
	 * CLONE_VM shares the caller's address space instead of copying it,
	 * and CLONE_UNTRACED keeps the new thread from being traced
	 * (NOTE(review): flag semantics per clone(2), not shown here).
	 * The stack_start argument is 0.
	 */
	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
}


do_fork

/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
// Common back end of sys_fork, sys_vfork, sys_clone and kernel_thread.
// Returns the new task's pid number (nr) on success; on failure returns
// the error code carried in copy_process()'s result, or -EINVAL / -EPERM
// from the up-front flag and capability checks.
long do_fork(unsigned long clone_flags,
unsigned long stack_start, // stack start address, forwarded to copy_process()
struct pt_regs *regs, // caller's register set, forwarded to copy_process() and the tracehook calls
unsigned long stack_size, // stack size, forwarded to copy_process()
int __user *parent_tidptr, // user-space address that receives nr when CLONE_PARENT_SETTID is set
int __user *child_tidptr) // user-space address forwarded to copy_process()
{
struct task_struct *p;
int trace = 0;
long nr;

/*
* Do some preliminary argument and permissions checking before we
* actually start allocating stuff
*/
if (clone_flags & CLONE_NEWUSER) { // extra checks apply when CLONE_NEWUSER is requested
if (clone_flags & CLONE_THREAD) // CLONE_NEWUSER combined with CLONE_THREAD is rejected
return -EINVAL;
/* hopefully this check will go away when userns support is
* complete
*/
if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
!capable(CAP_SETGID))
return -EPERM;
}

/*
* When called from kernel_thread, don't do user tracing stuff.
*/
if (likely(user_mode(regs)))
trace = tracehook_prepare_clone(clone_flags);

// copy_process() does the actual work of building the new task
p = copy_process(clone_flags, stack_start, regs, stack_size,
child_tidptr, NULL, trace);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
if (!IS_ERR(p)) {
struct completion vfork;

trace_sched_process_fork(current, p);

nr = task_pid_vnr(p); // nr is the value returned on success (NOTE(review): presumably the pid as seen in the caller's pid namespace - confirm)

if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr); // store nr at the user-space address parent_tidptr

if (clone_flags & CLONE_VFORK) { // vfork semantics: set up the completion the caller will wait on further down
p->vfork_done = &vfork;
init_completion(&vfork); // only initialises the completion; the wait itself is the wait_for_completion() call below
}
// notify audit and tracing before the child is started
audit_finish_fork(p);
tracehook_report_clone(regs, clone_flags, nr, p);

/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
* hasn't gotten to tracehook_report_clone() yet. Now we
* clear it and set the child going.
*/
p->flags &= ~PF_STARTING;

wake_up_new_task(p, clone_flags); // set the child going (NOTE(review): run-queue details are inside wake_up_new_task(), not shown)

tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);

// With CLONE_VFORK the caller blocks here on the completion registered
// in p->vfork_done; per the header comment this lasts until the child
// has finished using the VM.
if (clone_flags & CLONE_VFORK) {
freezer_do_not_count();
wait_for_completion(&vfork); // the caller sleeps on this completion
freezer_count();
tracehook_report_vfork_done(p, nr);
}
} else {
nr = PTR_ERR(p);
}
return nr;
}
说明几点

(1)p = copy_process(clone_flags, stack_start, regs, stack_size,  child_tidptr, NULL, trace);执行实际的进程复制工作;

(2)if (clone_flags & CLONE_VFORK) 表示如果是执行vfork这个函数,父进程会睡眠下去;


copy_process中关键代码1

设置task_struct和系统堆栈

	// Duplicate the caller's (current) task_struct for the child via dup_task_struct().
p = dup_task_struct(current); // a NULL result is treated as failure
if (!p)
goto fork_out;


static struct task_struct *dup_task_struct(struct task_struct *orig){	struct task_struct *tsk;	struct thread_info *ti;	unsigned long *stackend;	int err;	prepare_to_copy(orig); //保存FPU等寄存器内容到thread_info中	tsk = alloc_task_struct();	//kem, 获取新的进程描述符task_struct的内存	if (!tsk)		return NULL;	ti = alloc_thread_info(tsk); //task无用处,使用get_free_pages获得两个页大小的内存	if (!ti) {	//ti若分配失败,还要释放原内存		free_task_struct(tsk);			return NULL;	} 	err = arch_dup_task_struct(tsk, orig);	//将旧的task_struct复制给新的task_struct	if (err)		goto out;	tsk->stack = ti; //改变新进程的stack指向到新的thread_info中	err = prop_local_init_single(&tsk->dirties);	if (err)		goto out;	setup_thread_stack(tsk, orig);	//链接task_struct和thread_info,确定内存布局,相互指向	clear_user_return_notifier(tsk);	clear_tsk_need_resched(tsk);	stackend = end_of_stack(tsk);	*stackend = STACK_END_MAGIC;	/* for overflow detection */#ifdef CONFIG_CC_STACKPROTECTOR	tsk->stack_canary = get_random_int();#endif	/* One for us, one for whoever does the "release_task()" (usually parent) */	atomic_set(&tsk->usage,2);  //要将新进程的使用计数置为2	atomic_set(&tsk->fs_excl, 0);#ifdef CONFIG_BLK_DEV_IO_TRACE	tsk->btrace_seq = 0;#endif	tsk->splice_pipe = NULL;	account_kernel_stack(ti, 1);	return tsk;out:	free_thread_info(ti);	free_task_struct(tsk);	return NULL;}

/*
 * Copy the process descriptor src into dst.
 *
 * The whole structure is copied by assignment. If src has an FPU state
 * area allocated, dst gets its own area and the contents are copied
 * over, so the two tasks do not end up pointing at the same FPU state.
 *
 * Returns 0 on success or the error returned by fpu_alloc().
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	int ret;

	*dst = *src;
	if (fpu_allocated(&src->thread.fpu)) {
		/* drop the FPU fields inherited from src before allocating */
		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
		ret = fpu_alloc(&dst->thread.fpu);
		if (ret)
			return ret;
		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
	}
	return 0;
}


copy_process中关键代码2

一些字段的设置

	p->did_exec = 0;	// reset did_exec in the child (NOTE(review): presumably tracks whether the task has called execve - confirm)
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);

// the child starts with empty children and sibling lists
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);

rcu_copy_process(p);

p->vfork_done = NULL;

spin_lock_init(&p->alloc_lock);

init_sigpending(&p->pending); // initialise the child's pending-signal state

// zero the CPU time accounting fields
p->utime = cputime_zero;
p->stime = cputime_zero;
p->gtime = cputime_zero;
p->utimescaled = cputime_zero;
p->stimescaled = cputime_zero;

copy_process中关键代码2(续)

设置子进程的调度信息

	/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags); // scheduler-side initialisation of the new task

copy_process中关键代码3

复制和共享进程的各个部分

	if ((retval = audit_alloc(p)))
goto bad_fork_cleanup_policy;
/* copy all the process information */
// Each step below jumps to the cleanup label of the previous step on
// failure, so earlier steps are unwound in reverse order.
if ((retval = copy_semundo(clone_flags, p)))
goto bad_fork_cleanup_audit;
if ((retval = copy_files(clone_flags, p)))
goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
goto bad_fork_cleanup_files;
if ((retval = copy_sighand(clone_flags, p)))
goto bad_fork_cleanup_fs;
if ((retval = copy_signal(clone_flags, p)))
goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p))) // handles the process address space
goto bad_fork_cleanup_signal;
if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_mm;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;

// set up the child's saved registers and thread state (see copy_thread())
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);


copy_files

static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
int error = 0;

/*
* A background process may not have any files ...
*/
oldf = current->files; //原进程的files_struct
if (!oldf)
goto out;

if (clone_flags & CLONE_FILES) { //共享打开的文件表
atomic_inc(&oldf->count); //增加引用计数
goto out;
}

newf = dup_fd(oldf, &error);
if (!newf)
goto out;

tsk->files = newf;
error = 0;
out:
return error;
}


copy_thread

// Set up the saved register frame and thread state of the new task p.
//   clone_flags: only CLONE_SETTLS is examined here
//   sp:          value stored as the child's saved sp
//   unused:      not referenced in this function
//   p:           the new task
//   regs:        caller's registers, copied as the starting point for the child
// Returns 0 on success, -ENOMEM if the I/O bitmap cannot be duplicated,
// or the error returned by do_set_thread_area().
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
{
struct pt_regs *childregs;
struct task_struct *tsk;
int err;

// the child's pt_regs starts as a full copy of the caller's registers
childregs = task_pt_regs(p);
*childregs = *regs;
childregs->ax = 0; // ax is zeroed in the child's frame (presumably what the child sees as its return value - confirm)
childregs->sp = sp; // the child's saved sp is the sp argument

p->thread.sp = (unsigned long) childregs; // thread.sp holds the address of the child's pt_regs frame
p->thread.sp0 = (unsigned long) (childregs+1); // thread.sp0 holds the address just past that pt_regs frame

p->thread.ip = (unsigned long) ret_from_fork; // thread.ip is set to ret_from_fork

task_user_gs(p) = get_user_gs(regs);

p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;

memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

// if the current task has TIF_IO_BITMAP set, the child gets its own copy of the bitmap
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}

err = 0;

/*
* Set a new TLS for the child thread?
*/
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);

// TLS setup failed: free the bitmap copy made above
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
return err;
}

copy_process中关键代码4

获得子进程pid

	if (pid != &init_struct_pid) {
retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns); // allocate a pid in the child's pid namespace
if (!pid)
goto bad_fork_cleanup_io;

if (clone_flags & CLONE_NEWPID) {
retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
if (retval < 0)
goto bad_fork_free_pid;
}
}

p->pid = pid_nr(pid); // numeric pid taken from the struct pid (NOTE(review): presumably the global number - confirm)
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid; // a new thread takes the creator's thread-group id

if (current->nsproxy != p->nsproxy) {
retval = ns_cgroup_clone(p, pid);
if (retval)
goto bad_fork_free_pid;
}

// remember child_tidptr in set_child_tid only when CLONE_CHILD_SETTID is set
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
// remember child_tidptr in clear_child_tid only when CLONE_CHILD_CLEARTID is set
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;


copy_process中关键代码5

线程还是进程

	/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { // with either flag the child shares the creator's parent
p->real_parent = current->real_parent; // creator and child have the same real_parent
p->parent_exec_id = current->parent_exec_id;
} else { // otherwise the creator itself becomes the child's real_parent
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}

线程

	/* CLONE_THREAD: the new task joins the creator's thread group. */
	if (clone_flags & CLONE_THREAD) {
		/* account for the extra thread in the shared signal struct */
		current->signal->nr_threads++;
		atomic_inc(&current->signal->live);
		atomic_inc(&current->signal->sigcnt);
		/* same group leader as the creator */
		p->group_leader = current->group_leader;
		/* append the new task to the leader's thread_group list */
		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
	}

execve

/*
 * sys_execve() executes a new program.
 *
 * The user-supplied name is fetched with getname(); if that fails its
 * error code is returned straight away. Otherwise the request is passed
 * to do_execve(), the name is released with putname(), and do_execve()'s
 * result is returned.
 */
long sys_execve(const char __user *name,
		const char __user *const __user *argv,
		const char __user *const __user *envp, struct pt_regs *regs)
{
	char *filename = getname(name);
	long error;

	if (IS_ERR(filename))
		return PTR_ERR(filename);

	error = do_execve(filename, argv, envp, regs);

#ifdef CONFIG_X86_32
	/* Make sure we don't return using sysenter.. */
	if (!error)
		set_thread_flag(TIF_IRET);
#endif

	putname(filename);
	return error;
}

do_execve中关键代码

	file = open_exec(filename);	// open the executable named by filename
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;

sched_exec();

bprm->file = file; // record the opened file in the binprm
bprm->filename = filename; // name of the executable
bprm->interp = filename;

retval = bprm_mm_init(bprm);
if (retval)
goto out_file;

bprm->argc = count(argv, MAX_ARG_STRINGS); // number of entries in argv, bounded by MAX_ARG_STRINGS

	retval = search_binary_handler(bprm,regs);  // try the registered binary formats (e.g. a.out, ELF) until one loads the file


// Walk the registered binary formats and call each one's load_binary
// handler on bprm. NOTE(review): this excerpt stops inside the loop; the
// rest of the function is not shown.
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
unsigned int depth = bprm->recursion_depth;
int try,retval;
struct linux_binfmt *fmt;

retval = security_bprm_check(bprm);
if (retval)
return retval;

/* kernel module loader fixup */
/* so we don't try to load run modprobe in kernel space. */
set_fs(USER_DS);

retval = audit_bprm(bprm);
if (retval)
return retval;

retval = -ENOENT;
for (try=0; try<2; try++) { // at most two passes over the format list (presumably the second follows a module load - not visible in this excerpt)
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
if (!fn)
continue;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
retval = fn(bprm, regs); // call this format's load_binary handler, e.g. load_aout_binary for a.out
说明几点

(1)load_aout_binary为a.out可执行文件格式的装入,此外还支持elf和脚本等格式文件的装入;