一、进程重要字段描述

在文件include\linux\sched.h中定义了进程描述符task_struct，关注如下字段：

进程状态

volatile long state：表示进程状态，在该文件头部有几种状态的取值。

long exit_state：表示进程退出状态，下面的定义中前缀为EXIT的表示该字段取值，表示进程退出状态

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */
/* Values stored in tsk->state (runnability). */
#define TASK_RUNNING        0
#define TASK_INTERRUPTIBLE    1
#define TASK_UNINTERRUPTIBLE    2
#define TASK_STOPPED        4
#define TASK_TRACED        8
/* in tsk->exit_state */
#define EXIT_ZOMBIE        16
#define EXIT_DEAD        32
/* in tsk->state again */
#define TASK_NONINTERACTIVE    64

/* Plain assignment of state_value to (tsk)->state. */
#define __set_task_state(tsk, state_value)        \
    do { (tsk)->state = (state_value); } while (0)
/* Same assignment, but carried out through set_mb(). */
#define set_task_state(tsk, state_value)        \
    set_mb((tsk)->state, (state_value))

/*
 * set_current_state() includes a barrier so that the write of current->state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *    set_current_state(TASK_UNINTERRUPTIBLE);
 *    if (do_i_need_to_sleep())
 *        schedule();
 *
 * If the caller does not need such serialisation then use __set_current_state()
 */
/* Unserialised variant: plain assignment to current->state. */
#define __set_current_state(state_value)            \
    do { current->state = (state_value); } while (0)
/* Serialised variant: the write to current->state goes through set_mb(). */
#define set_current_state(state_value)        \
    set_mb(current->state, (state_value))

state取值和操作

新加入了一个状态TASK_NONINTERACTIVE？还不明白是干什么的，暂时先记着

标志一个进程

pid_t pid;
pid_t tgid;

用process id，即pid来标志一个进程。POSIX规定一个线程组中所有线程都必须有同一个pid（线程在linux中也被叫做轻量级进程），在linux中所有线程使用该线程组的领头线程（thread group leader）的pid，存放在tgid之中。getpid系统调用返回的是当前进程的tgid而不是pid

线程描述符

struct thread_info *thread_info;

该结构和进程的内核栈存放在两个连续的页框之内，thread_info放在这块区域的低地址端，栈则从高地址端向下增长，因此可以根据esp指针找到thread_info（如果栈是8K，那么只需要屏蔽掉esp的低13位），同时根据thread_info中的task指针可以快速找到进程的进程描述符。

thread_info定义在include\asm-i386\thread_info.h之中，该文件还定义了一些对于thread_info的操作函数。

/*
 * Low-level per-thread data. It shares one THREAD_SIZE area with the
 * kernel stack (see union thread_union) and points back to the owning
 * task_struct through ->task.
 */
struct thread_info {
    struct task_struct    *task;        /* main task structure */
    struct exec_domain    *exec_domain;    /* execution domain */
    unsigned long        flags;        /* low level flags */
    unsigned long        status;        /* thread-synchronous flags */
    __u32            cpu;        /* current CPU */
    int            preempt_count;    /* 0 => preemptable, <0 => BUG */


    mm_segment_t        addr_limit;    /* thread address space:
                           0-0xBFFFFFFF for user-thread
                           0-0xFFFFFFFF for kernel-thread
                        */
    void            *sysenter_return;
    struct restart_block    restart_block;

    unsigned long           previous_esp;   /* ESP of the previous stack in case
                           of nested (IRQ) stacks
                        */
    /* Zero-length trailing array; occupies no space in the structure. */
    __u8            supervisor_stack[0];
};

thread_info

在include\linux\sched.h还定义了thread_union，用来方便的描述线程描述符和内核栈

/*
 * Overlays a thread_info with an array of THREAD_SIZE bytes, so the
 * descriptor and the kernel stack share the same storage.
 */
union thread_union {
    struct thread_info thread_info;
    unsigned long stack[THREAD_SIZE/sizeof(long)];
};

thread_union

进程链表

struct list_head tasks; 连接所有的进程描述符

在include\linux\list.h之中定义了linux的两种链表list_head和hlist_head、hlist_node和相关操作。在此不赘述，看task_struct中部分链表

在include\linux\sched.h还定义了用来遍历整个进程描述符的宏for_each_process，初始进程init_task

运行进程的链表

struct list_head run_list;

这个字段在调度的时候使用，当寻找下一个可运行的进程的时候就使用这个字段

进程间的关系

父子关系涉及到如下4个字段：

struct task_struct *real_parent;　　父进程
struct task_struct *parent;　　在发出ptrace调用时和real_parent不一致
struct list_head children;　　子进程链表的头
struct list_head sibling;　　子进程链表的下一个子进程

而进程间有非亲属关系，涉及的字段如下：

struct task_struct *group_leader;　　指向线程组领头进程的进程描述符

struct list_head thread_group;　　线程组的链表

pid_t signal->pgrp;　　所在进程组领头进程的PID

pid_t signal->session;　　会话领头进程的pid

struct list_head ptrace_children;　　跟踪子进程的链表头

struct list_head ptrace_list;　　跟踪子进程的链表节点

进程的等待队列

在include\linux\wait.h中定义了两个结构__wait_queue_head和__wait_queue，分别表示等待队列的头和等待队列的节点

/* Head of a wait queue: a spinlock plus the list of queued entries. */
struct __wait_queue_head {
    spinlock_t lock;
    struct list_head task_list;    /* list of struct __wait_queue entries */
};

__wait_queue_head

/* One entry (waiter) on a wait queue. */
struct __wait_queue {
    unsigned int flags;        /* WQ_FLAG_EXCLUSIVE marks an exclusive waiter */
#define WQ_FLAG_EXCLUSIVE    0x01
    void *private;            /* NOTE(review): presumably the waiting task_struct - confirm */
    wait_queue_func_t func;        /* how this waiter is woken up */
    struct list_head task_list;    /* links entries waiting for the same event */
};

__wait_queue

__wait_queue_head中包含一个自旋锁和一个链表头（list_head）

__wait_queue中第一个字段flags表示互斥（进程等待互斥的访问同一资源）还是非互斥进程（比如等待磁盘传输结束的所有进程），第二个字段private为void指针，指向的应该是等待进程的进程描述符（书上这一部分写的是task_struct指针，不清楚到这一版本怎么改成了这个），func字段表示的是如何唤醒进程（类型定义也在该文件之中），task_list字段把等待相同事件的进程串联起来

该文件中还定义了很多等待队列的操作，具体看文件，不再赘述

进程资源限制

在进程描述符中有一个信号描述符

struct signal_struct *signal;

该结构同样定义在本文件中，在signal_struct中有一个如下字段，在信号描述符中似乎还定义了很多和信号无关的数据，这里的资源限制，上面的一些pid值，我还不是很清楚为什么要这么做，是因为锁的缘故吗？信号描述符好像是没有锁的，而锁（siglock）在sighand之中？

struct rlimit rlim[RLIM_NLIMITS];

struct rlimit定义在include\linux\resource.h之中，两个字段分别表示资源的当前限制值和最大限制值

/* One resource limit: a current value and the maximum it may be raised to. */
struct rlimit {
    unsigned long    rlim_cur;    /* current limit */
    unsigned long    rlim_max;    /* maximum limit */
};

rlimit

在include\asm-generic\resource.h之中定义了很多以RLIMIT开头的宏，用来访问各种资源的限制

进程所属用户信息

在include\linux\sched.h之中，定义了一个user_struct的结构。一个用户不止拥有一个进程，多个进程间可以通过user_struct结构来共享用户的信息

/*
 * Per-user accounting data. All processes belonging to the same user
 * reference one user_struct, which is looked up through a uid hash.
 */
struct user_struct {
    atomic_t __count;    /* reference count */
    atomic_t processes;    /* How many processes does this user have? */
    atomic_t files;        /* How many open files does this user have? */
    atomic_t sigpending;    /* How many pending signals does this user have? */
#ifdef CONFIG_INOTIFY_USER
    atomic_t inotify_watches; /* How many inotify watches does this user have? */
    atomic_t inotify_devs;    /* How many inotify devs does this user have opened? */
#endif
    /* protected by mq_lock    */
    unsigned long mq_bytes;    /* How many bytes can be allocated to mqueue? */
    unsigned long locked_shm; /* How many pages of mlocked shm ? */

#ifdef CONFIG_KEYS
    struct key *uid_keyring;    /* UID specific keyring */
    struct key *session_keyring;    /* UID's default session keyring */
#endif

    /* Hash table maintenance information */
    struct list_head uidhash_list;
    uid_t uid;
};

user_struct

进程切换时硬件上下文的保存

不是如intel所设计的那样，为每个进程设置了tss字段，linux为一个cpu保留一个tss段，当发生异常和中断的时候，会根据tss中字段做一些操作，操作系统将硬件上下文保存在进程描述符的一个字段中，当进程被切换上cpu时，使用下面字段的部分值去更改tss段。

struct thread_struct thread;

thread_struct定义在include\asm-i386\processor.h之中

/*
 * Per-task CPU state kept in the process descriptor (task_struct.thread):
 * saved stack/instruction pointers, segment registers, debug registers,
 * fault information, FPU state, vm86 state and the I/O permission bitmap.
 */
struct thread_struct {
/* cached TLS descriptors. */
    struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
    unsigned long    esp0;
    unsigned long    sysenter_cs;
    unsigned long    eip;
    unsigned long    esp;
    unsigned long    fs;
    unsigned long    gs;
/* Hardware debugging registers */
    unsigned long    debugreg[8];  /* %%db0-7 debug registers */
/* fault info */
    unsigned long    cr2, trap_no, error_code;
/* floating point info */
    union i387_union    i387;
/* virtual 86 mode info */
    struct vm86_struct __user * vm86_info;
    unsigned long        screen_bitmap;
    unsigned long        v86flags, v86mask, saved_esp0;
    unsigned int        saved_fs, saved_gs;
/* IO permissions */
    unsigned long    *io_bitmap_ptr;
     unsigned long    iopl;
/* max allowed port in the bitmap, in bytes: */
    unsigned long    io_bitmap_max;
};

thread_struct

二、重要流程描述

2.1 fork、vfork、clone系统调用，创建一个子进程

fork、vfork、clone调用的系统服务例程在arch\i386\kernel\process.c之中，该文件还包括进程的其他系统调用的服务例程如exec等。

/*
 * fork() service routine: calls do_fork() with only SIGCHLD in the flags
 * and the caller's own user stack pointer (regs.esp) as the child's stack.
 * &regs is the address of the by-value register frame parameter.
 */
asmlinkage int sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}

sys_fork

/*
 * clone() service routine. The arguments are taken from the saved
 * registers: ebx = clone flags, ecx = new stack pointer, edx = parent tid
 * pointer, edi = child tid pointer.
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
    unsigned long clone_flags;
    unsigned long newsp;
    int __user *parent_tidptr, *child_tidptr;

    clone_flags = regs.ebx;
    newsp = regs.ecx;
    parent_tidptr = (int __user *)regs.edx;
    child_tidptr = (int __user *)regs.edi;
    /* No stack supplied: fall back to the caller's current user stack. */
    if (!newsp)
        newsp = regs.esp;
    return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}

sys_clone

/*
 * vfork() service routine: same as fork() but with CLONE_VFORK and
 * CLONE_VM added to the flags passed to do_fork().
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}

sys_vfork

其调用的do_fork在kernel\fork.c之中。

/*
 * Common back end of the fork, vfork and clone service routines.
 *
 * clone_flags:   CLONE_* flags; the bits selected by CSIGNAL carry the
 *                signal the child uses as its exit signal.
 * stack_start:   user stack pointer for the child.
 * regs:          register frame of the calling task.
 * stack_size:    passed through to copy_process().
 * parent_tidptr: user address handed to copy_process() (CLONE_PARENT_SETTID).
 * child_tidptr:  user address handed to copy_process()
 *                (CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID).
 *
 * Returns the new pid number on success, -EAGAIN when no pid could be
 * allocated, or the error encoded in the pointer returned by
 * copy_process().
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    struct pid *pid = alloc_pid();
    long nr;

    if (!pid)
        return -EAGAIN;
    nr = pid->nr;
    /* The caller is being traced: decide whether the child is traced too. */
    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

    p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        /* vfork: set up the completion the parent waits on further down. */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        /* Either wake the child now or leave it in TASK_STOPPED. */
        if (!(clone_flags & CLONE_STOPPED))
            wake_up_new_task(p, clone_flags);
        else
            p->state = TASK_STOPPED;

        /* Report the new pid through ptrace_message and notify via ptrace. */
        if (unlikely (trace)) {
            current->ptrace_message = nr;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        /* vfork: block here until the completion set up above is signalled. */
        if (clone_flags & CLONE_VFORK) {
            wait_for_completion(&vfork);
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
                current->ptrace_message = nr;
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
            }
        }
    } else {
        /* copy_process() failed: release the pid and return its error code. */
        free_pid(pid);
        nr = PTR_ERR(p);
    }
    return nr;
}

do_fork

do_fork有6个参数，含义如下：

1、clone_flags：各种信息，分为两部分，最低的字节为信号类型，用来规定子进程去世时应该向父进程发出的信号，高位是在include\linux\sched.h之中定义了很多以CLONE_开头的宏，代表不同的含义（具体见文件内，这里不再描述）。在sys_fork调用do_fork时使用了信号SIGCHLD，该信号是子进程停止、结束、或是在被跟踪时获得。

2、stack_start：将父进程用户态栈的地址传入

3、regs：寄存器值

4、stack_size：栈大小

5、parent_tidptr：创建子进程后，将子进程的pid写到该地址，该指针指向父进程的一块地址

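6、child_tidptr：和parent_tidptr类似，是clone调用传入的子进程用户空间的一块地址，设置了CLONE_CHILD_SETTID或CLONE_CHILD_CLEARTID时，copy_process会把它记录到子进程的set_child_tid或clear_child_tid字段中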

接着分析do_fork所做的工作

a、调用了alloc_pid函数分配了一个pid。（接下来需要具体分析各个pid的情况，linux内核设计与实现进程篇里有一点信息，见kernel\pid.c文件）

b、查看进程的ptrace字段，如果置位表示有进程想跟踪子进程，则置位clone_flags的CLONE_PTRACE位。（查看fork_traceflag函数可知跟踪子进程对vfork、fork、clone都做了区分，还涉及到SIGCHLD的判断）

c、调用了copy_process函数复制进程，返回进程的进程描述符。copy_process是一个比较复杂的函数，下面单独看看。

d、接着使用IS_ERR宏判断copy_process返回的指针是否正确（是否位于最后一个页），该宏定义在include\linux\err.h中，包括下面用到的PTR_ERR宏（将指针转成错误号），用来处理指针错误，可直接搜索这个字段。

e、如果必须要跟踪子进程，即设置了PT_PTRACED，或者设置了CLONE_STOPPED，则给子进程增加一个SIGSTOP信号，并设置信号标志位

f、如果没有设置CLONE_STOPPED，则调用wake_up_new_task直接唤醒新的进程，函数实现在kernel\sched.c之中，等到调度时再看吧。

g、如果设置了CLONE_STOPPED，则进程状态设置为TASK_STOPPED

h、如果设置了trace，则把子进程的pid放入父进程的ptrace_message，并调用ptrace_notify，这是信号部分的函数，在\kernel\signal.c中，它使当前进程停止运行，向当前进程的父进程，也就是debugger进程发送SIGTRAP信号，并且可以在ptrace_message中找到子进程的pid。

i、如果设置了CLONE_VFORK，则让父进程挂起，直到子进程结束。wait_for_completion在看调度的时候再具体看下。接下来这段代码应该停下来了，直到子进程执行完，下面的代码才会被执行，下面的内容还是执行ptrace_notify函数。

2.2 再看fork、vfork、clone系统调用的区别

clone系统调用是功能最为强大的一个，他调用的do_fork函数所有参数都来自于调用者传入，可以实现任何程度上的进程复制。

fork系统调用无参数，它的clone_flags被置为空，只规定了子进程退出时向父进程发送SIGCHLD信号。同时还有写时复制的原因在里面。

vfork系统调用则在fork系统调用的基础上加了CLONE_VFORK和CLONE_VM位，CLONE_VM位表示共享内存描述符和所有页表，即共享了所有的地址空间，同时CLONE_VFORK表示子进程运行时让父进程挂起，直到子进程结束。在网上看到vfork系统调用出来的子进程不能使用return，也不能使用exit，但能使用_exit，不知道为什么，还需要思考一下。是因为子进程使用return和exit之后会释放掉父进程的栈空间，导致父进程不能继续执行了吗？也就是说子进程和父进程的执行步骤还是不一样的，到底有哪些事件一样，哪些事件不一样还需要好好看一看exit和_exit的区别。

2.3 copy_process函数复制进程

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * The first six parameters are the ones do_fork() received; pid is the
 * pid number do_fork() already allocated for the child.
 *
 * Returns the new task_struct, or an ERR_PTR()-encoded errno on failure.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *parent_tidptr,
                    int __user *child_tidptr,
                    int pid)
{
    int retval;
    struct task_struct *p = NULL;

    /* CLONE_NEWNS and CLONE_FS may not be requested together. */
    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    /* Get a task_struct and thread_info for the child, copied from current. */
    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

#ifdef CONFIG_TRACE_IRQFLAGS
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    /*
     * Per-user process limit: fail with -EAGAIN once the user has reached
     * RLIMIT_NPROC, unless the caller has CAP_SYS_ADMIN or
     * CAP_SYS_RESOURCE, or the user is root_user.
     */
    retval = -EAGAIN;
    if (atomic_read(&p->user->processes) >=
            p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                p->user != &root_user)
            goto bad_fork_free;
    }

    /* Account the new process to its user and pin the group info. */
    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);
    get_group_info(p->group_info);

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;

    p->did_exec = 0;
    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    copy_flags(clone_flags, p);
    p->pid = pid;
    /* CLONE_PARENT_SETTID: store the child's pid at the user address. */
    retval = -EFAULT;
    if (clone_flags & CLONE_PARENT_SETTID)
        if (put_user(p->pid, parent_tidptr))
            goto bad_fork_cleanup_delays_binfmt;

    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    /* The child starts with no pending signals. */
    clear_tsk_thread_flag(p, TIF_SIGPENDING);
    init_sigpending(&p->pending);

    /* Reset time and I/O accounting for the child. */
    p->utime = cputime_zero;
    p->stime = cputime_zero;
     p->sched_time = 0;
    p->rchar = 0;        /* I/O counter: bytes read */
    p->wchar = 0;        /* I/O counter: bytes written */
    p->syscr = 0;        /* I/O counter: read syscalls */
    p->syscw = 0;        /* I/O counter: write syscalls */
    acct_clear_integrals(p);

     p->it_virt_expires = cputime_zero;
    p->it_prof_expires = cputime_zero;
     p->it_sched_expires = 0;
     INIT_LIST_HEAD(&p->cpu_timers[0]);
     INIT_LIST_HEAD(&p->cpu_timers[1]);
     INIT_LIST_HEAD(&p->cpu_timers[2]);

    p->lock_depth = -1;        /* -1 = no lock */
    do_posix_clock_monotonic_gettime(&p->start_time);
    p->security = NULL;
    p->io_context = NULL;
    p->io_wait = NULL;
    p->audit_context = NULL;
    cpuset_fork(p);
#ifdef CONFIG_NUMA
     p->mempolicy = mpol_copy(p->mempolicy);
     if (IS_ERR(p->mempolicy)) {
         retval = PTR_ERR(p->mempolicy);
         p->mempolicy = NULL;
         goto bad_fork_cleanup_cpuset;
     }
    mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
#endif

    rt_mutex_init_task(p);

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif

    /* A new thread (CLONE_THREAD) takes the caller's tgid, otherwise its own pid. */
    p->tgid = p->pid;
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;

    if ((retval = security_task_alloc(p)))
        goto bad_fork_cleanup_policy;
    if ((retval = audit_alloc(p)))
        goto bad_fork_cleanup_security;
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_audit;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_keys(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    if ((retval = copy_namespace(clone_flags, p)))
        goto bad_fork_cleanup_keys;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_namespace;

    /* Only the address is recorded here; nothing is written through it yet. */
    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;

    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing should be turned off in the child regardless
     * of CLONE_PTRACE.
     */
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif

    /* Our parent execution domain becomes current domain
       These must match for thread signalling to apply */
       
    p->parent_exec_id = p->self_exec_id;

    /* ok, now we should be set up.. */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    p->pdeath_signal = 0;
    p->exit_state = 0;

    /*
     * Ok, make it visible to the rest of the system.
     * We don't wake it up yet.
     */
    p->group_leader = p;
    INIT_LIST_HEAD(&p->thread_group);
    INIT_LIST_HEAD(&p->ptrace_children);
    INIT_LIST_HEAD(&p->ptrace_list);

    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(p, clone_flags);

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);

    /*
     * The task hasn't been attached yet, so its cpus_allowed mask will
     * not be changed, nor will its assigned CPU.
     *
     * The cpus_allowed mask of the parent may have changed after it was
     * copied first time - so re-copy it here, then check the child's CPU
     * to ensure it is on a valid CPU (and if not, just force it back to
     * parent's CPU). This avoids a lot of nasty races.
     */
    p->cpus_allowed = current->cpus_allowed;
    if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
            !cpu_online(task_cpu(p))))
        set_task_cpu(p, smp_processor_id());

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
        p->real_parent = current->real_parent;
    else
        p->real_parent = current;
    p->parent = p->real_parent;

    spin_lock(&current->sighand->siglock);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * its process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
     */
     recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_cleanup_namespace;
    }

    /* New thread: join the caller's thread group. */
    if (clone_flags & CLONE_THREAD) {
        p->group_leader = current->group_leader;
        list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);

        if (!cputime_eq(current->signal->it_virt_expires,
                cputime_zero) ||
            !cputime_eq(current->signal->it_prof_expires,
                cputime_zero) ||
            current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
            !list_empty(&current->signal->cpu_timers[0]) ||
            !list_empty(&current->signal->cpu_timers[1]) ||
            !list_empty(&current->signal->cpu_timers[2])) {
            /*
             * Have child wake up on its first tick to check
             * for process CPU timers.
             */
            p->it_prof_expires = jiffies_to_cputime(1);
        }
    }

    /*
     * inherit ioprio
     */
    p->ioprio = current->ioprio;

    /* Link the child to its parent, the pid hashes and the task list. */
    if (likely(p->pid)) {
        add_parent(p);
        if (unlikely(p->ptrace & PT_PTRACED))
            __ptrace_link(p, current->parent);

        /* Only a thread group leader joins the process group, session and global task list. */
        if (thread_group_leader(p)) {
            p->signal->tty = current->signal->tty;
            p->signal->pgrp = process_group(current);
            p->signal->session = current->signal->session;
            attach_pid(p, PIDTYPE_PGID, process_group(p));
            attach_pid(p, PIDTYPE_SID, p->signal->session);

            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            __get_cpu_var(process_counts)++;
        }
        attach_pid(p, PIDTYPE_PID, p->pid);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    return p;

/*
 * Error unwinding: control enters at the label matching the step that
 * failed and falls through the remaining labels, undoing the earlier
 * steps in reverse order.
 */
bad_fork_cleanup_namespace:
    exit_namespace(p);
bad_fork_cleanup_keys:
    exit_keys(p);
bad_fork_cleanup_mm:
    if (p->mm)
        mmput(p->mm);
bad_fork_cleanup_signal:
    cleanup_signal(p);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_security:
    security_task_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
    mpol_free(p->mempolicy);
bad_fork_cleanup_cpuset:
#endif
    cpuset_exit(p);
bad_fork_cleanup_delays_binfmt:
    delayacct_tsk_free(p);
    if (p->binfmt)
        module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
    module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
    put_group_info(p->group_info);
    atomic_dec(&p->user->processes);
    free_uid(p->user);
bad_fork_free:
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}

copy_process

函数执行步骤如下：

1、首先判断clone_flags的各种位

CLONE_NEWNS和CLONE_FS标志不能同时被设置，看文件系统时再来看这部分
CLONE_THREAD表示把子进程插入到父进程同一线程组中，CLONE_SIGHAND共享信号处理的表、阻塞信号的表、挂起信号的表。
设置了CLONE_SIGHAND位必须设置CLONE_VM位，共享内存描述符和所有页表

2、security_task_create是钩子函数，后面再看

3、dup_task_struct函数为子进程获取进程描述符

/*
 * Allocate a task_struct and a thread_info for a new task and fill both
 * with a copy of orig's contents.
 *
 * Returns the new task_struct, or NULL if either allocation fails.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;

    prepare_to_copy(orig);

    tsk = alloc_task_struct();
    if (!tsk)
        return NULL;

    ti = alloc_thread_info(tsk);
    if (!ti) {
        free_task_struct(tsk);
        return NULL;
    }

    /* Structure copy of the whole descriptor, then point it at its own thread_info. */
    *tsk = *orig;
    tsk->thread_info = ti;
    setup_thread_stack(tsk, orig);

    /* One for us, one for whoever does the "release_task()" (usually parent) */
    atomic_set(&tsk->usage,2);
    atomic_set(&tsk->fs_excl, 0);
    tsk->btrace_seq = 0;
    tsk->splice_pipe = NULL;
    return tsk;
}

dup_task_struct

dup_task_struct函数执行如下步骤：

a、prepare_to_copy定义在arch\i386\kernel\process.c之中，直接调用unlazy_fpu，在include\asm-i386\i387.h中，所做的工作是将FPU、MMX、SSE\SSE2寄存器的内容保存到父进程之中。稍后会将他们复制到子进程之中。是保存到thread_info的哪个字段啊？最后__unlazy_fpu函数的实现也比较复杂，thread_info的字段里好像也没有相关的字段（从前面thread_struct的定义看，更可能是保存在进程描述符的thread.i387字段中，待确认）。

b、调用了alloc_task_struct分配了一个进程描述符，在fork.c之中

/* Allocate one task_struct from the task_struct_cachep cache with GFP_KERNEL. */
# define alloc_task_struct()    kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)

alloc_task_struct

c、接着是include\asm-i386\thread_info.h中的alloc_thread_info，分配一块THREAD_SIZE大小的空间，用来存放新的thread_info和内核栈。

/*
 * Allocate the THREAD_SIZE area that holds a task's thread_info (and its
 * kernel stack). Evaluates to the new area, or NULL on allocation failure.
 *
 * With CONFIG_DEBUG_STACK_USAGE the area is additionally zero-filled;
 * otherwise it is returned as kmalloc() delivered it. The tsk argument is
 * not used by either variant.
 */
#ifdef CONFIG_DEBUG_STACK_USAGE
#define alloc_thread_info(tsk)                    \
    ({                            \
        struct thread_info *ret;            \
                                \
        ret = kmalloc(THREAD_SIZE, GFP_KERNEL);        \
        if (ret)                    \
            memset(ret, 0, THREAD_SIZE);        \
        ret;                        \
    })
#else
#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL)
#endif

alloc_thread_info

c、接着是复制了整个进程描述符的内容。

d、调用setup_thread_stack函数，在include\linux\sched.h之中，注意这个函数的参数，第一个是新建的进程描述符指针tsk，但是此时还是指向了原进程描述符的内容，只是tsk的thread_info结构重新建了一个。第二个参数是当前调用进程的进程描述符指针。

static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
    *task_thread_info(p) = *task_thread_info(org);
    task_thread_info(p)->task = p;
}

setup_thread_stack

可以看到其实这里的thread_info结构其实也只是做了简单的数据内容复制。然后将thread_info中的task指针指向了新建的task_struct结构。

e、usage表示该进程描述符的使用次数，设为2，而且进程处于活动状态

f、fs_excl、btrace_seq、splice_pipe、这些个字段也不知道干嘛用的、文件系统时再看看吧

进程描述符的创建就结束了，下面回到copy_process函数中去。

4、使用p->user->processes和p->signal->rlim[RLIMIT_NPROC].rlim_cur进行比较。signal->rlim数组已经描述过，表示资源限制，RLIMIT_NPROC表示最大进程数。user是定义在进程描述符同一文件中的user_struct结构，表示一个用户的多进程间共享的用户信息。这里比较用户当前拥有进程数是否大于最大进程数。至于这个capability又是另外一块了，下次再看。

5、递增引用数和拥有进程数

6、get_group_info就是递增了一下group_info的引用，应该是进程组共享的信息，在include\linux\sched.h之中

7、拿nr_threads和max_threads来比较，看系统进程数目是否大于最大值，原则是所有进程的内核栈空间不能超过物理内存的1/8。

8、exec_domain是执行域，和模块等相关概念可以深入理解一下。

9、did_exec记录的是发出exec的次数，置为0。然后调用copy_flags更改了一些标志位。

/*
 * Derive the child's flags from the inherited ones: PF_SUPERPRIV and
 * PF_NOFREEZE are dropped, PF_FORKNOEXEC is set. Unless CLONE_PTRACE was
 * requested, the child's ptrace field is cleared as well.
 */
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
    unsigned long inherited = p->flags & ~(PF_SUPERPRIV | PF_NOFREEZE);

    if ((clone_flags & CLONE_PTRACE) == 0)
        p->ptrace = 0;

    p->flags = inherited | PF_FORKNOEXEC;
}

copy_flags

相关标志位在 include\linux\sched.h中，PF_SUPERPRIV表示使用了超级用户权限，PF_NOFREEZE表示进程不能被冻结，PF_FORKNOEXEC表示没有发出过exec系统调用。CLONE_PTRACE表示跟踪新建的子进程，并且进程描述符中的ptrace字段表示进程被跟踪。

10、设置pid，设置了CLONE_PARENT_SETTID位的话，将子进程的pid写入到指针处（该地址是clone传入的参数，由do_fork传给copy_process，是父进程用户空间的一块地址）

11、vfork_done是在vfork的时候使用的一个指针，在前面的do_fork有涉及到，是有关调度的。

12、初始化进程描述符中的children和sibling指针，使用INIT_LIST_HEAD函数

/* Make list an empty list: both of its links point back at itself. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
    struct list_head *self = list;

    self->next = self;
    self->prev = self;
}

INIT_LIST_HEAD

13、初始化alloc_lock自旋锁、取消信号标志位、将私有信号队列初始化、初始化各种时间变量

14、这部分很多个变量用途还不是很清楚，以后再看

15、进程的线程组号设为进程pid号，这说明新建的是进程而不是线程，如果是新建线程，则设置CLONE_THREAD位，tgid的值就应该设为父进程的tgid（即所在线程组领头线程的pid）。设置了CLONE_THREAD位，就应该设置信号共享位，同时应该设置CLONE_VM位，共享内存描述符和所有页表。

16、接着是调用了很多的copy函数，就是创建一下新的结构。

17、看copy_thread函数，用clone传入的寄存器值和clone_flags去初始化子进程的内核栈。

/*
 * Set up the child's saved register frame and thread_struct.
 *
 * esp is the user stack pointer the child will use, regs the register
 * frame that is copied for the child, and p the child's task_struct.
 * The nr and unused parameters are not read here.
 *
 * Returns 0 on success, -ENOMEM if the I/O bitmap copy cannot be
 * allocated, or -EFAULT / -EINVAL when the CLONE_SETTLS descriptor cannot
 * be read or is not acceptable.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
    unsigned long unused,
    struct task_struct * p, struct pt_regs * regs)
{
    struct pt_regs * childregs;
    struct task_struct *tsk;
    int err;

    /* The child's frame is a copy of regs, except for eax and esp. */
    childregs = task_pt_regs(p);
    *childregs = *regs;
    childregs->eax = 0;
    childregs->esp = esp;

    p->thread.esp = (unsigned long) childregs;
    p->thread.esp0 = (unsigned long) (childregs+1);

    /* Saved instruction pointer for the child: ret_from_fork. */
    p->thread.eip = (unsigned long) ret_from_fork;

    savesegment(fs,p->thread.fs);
    savesegment(gs,p->thread.gs);

    /* The caller has an I/O bitmap: give the child its own copy. */
    tsk = current;
    if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
        p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
        if (!p->thread.io_bitmap_ptr) {
            p->thread.io_bitmap_max = 0;
            return -ENOMEM;
        }
        memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
            IO_BITMAP_BYTES);
        set_tsk_thread_flag(p, TIF_IO_BITMAP);
    }

    /*
     * Set a new TLS for the child thread?
     */
    if (clone_flags & CLONE_SETTLS) {
        struct desc_struct *desc;
        struct user_desc info;
        int idx;

        /* The descriptor is read from the user address held in the child's esi. */
        err = -EFAULT;
        if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
            goto out;
        err = -EINVAL;
        if (LDT_empty(&info))
            goto out;

        idx = info.entry_number;
        if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
            goto out;

        desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
        desc->a = LDT_entry_a(&info);
        desc->b = LDT_entry_b(&info);
    }

    err = 0;
 out:
    /* On failure, release the I/O bitmap if the child has one. */
    if (err && p->thread.io_bitmap_ptr) {
        kfree(p->thread.io_bitmap_ptr);
        p->thread.io_bitmap_max = 0;
    }
    return err;
}

copy_thread

a、调用了task_pt_regs宏，该宏返回了内核栈中保存的pt_regs地址。该宏定义在include\asm-i386\processor.h之中。

/*
 * task_pt_regs(task) evaluates to the address of the struct pt_regs that
 * sits just below the top of the task's kernel stack.
 *
 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
 * This is necessary to guarantee that the entire "struct pt_regs"
 * is accessible even if the CPU hasn't stored the SS/ESP registers
 * on the stack (interrupt gate does not save these registers
 * when switching to the same priv ring).
 * Therefore beware: accessing the xss/esp fields of the
 * "struct pt_regs" is possible, but they may contain the
 * completely wrong values.
 */
#define task_pt_regs(task)                                             \
({                                                                     \
       struct pt_regs *__regs__;                                       \
       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
       __regs__ - 1;                                                   \
})

/* Number of unsigned longs in one THREAD_SIZE area. */
#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
/* Address THREAD_SIZE bytes past info, i.e. the end of the stack area. */
#define KSTK_TOP(info)                                                 \
({                                                                     \
       unsigned long *__ptr = (unsigned long *)(info);                 \
       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
})

task_pt_regs

其中KSTK_TOP就是把指针调整到栈底，这里用了数组的方式应该是为了来适配不同的long所占的不同字节数，这里THREAD_SIZE一般是两个页，有几个函数和宏定义在include\linux\sched.h，不多赘述了。

这个地方需要特别注意的地方是减8这个操作，上面的代码中有一段注释，说是在中断门的时候系统不会自动保存ss和esp寄存器，为了能完整的访问整个结构体，所以预留下了8个字节的空位。看include\asm-i386\ptrace.h中关于pt_regs的定义，发现ss和esp正好是最后两个，即靠近栈底的一边。

/*
 * Register frame stored on the kernel stack. esp and xss are the last two
 * members, which is what the 8 bytes reserved by task_pt_regs() cover.
 */
struct pt_regs {
    long ebx;
    long ecx;
    long edx;
    long esi;
    long edi;
    long ebp;
    long eax;
    int  xds;
    int  xes;
    long orig_eax;
    long eip;
    int  xcs;
    long eflags;
    long esp;
    int  xss;
};

pt_regs

b、复制整个寄存器的内容，但是eax的值置为0，这是为了让子进程返回的时候不返回pid的值，fork使用的时候就有这个规则。

c、接着是更改了子进程的esp等一系列寄存器的保存值，这里在切换的时候仔细再看。其中子进程的eip设置成了ret_from_fork函数，汇编函数定义在arch\i386\kernel\entry.S之中。

d、下面的savesegment定义在include\asm-i386\system.h中

/* Copy the segment register named seg into value with a single mov. */
#define savesegment(seg, value) \
    asm volatile("mov %%" #seg ",%0":"=rm" (value))

savesegment

也就是保存段寄存器到thread对应位置中去。

e、关于io位图的后面再看

f、下面一段是创建TLS。后面再看吧。

copy_thread完了，回到copy_process函数

18、CLONE_CHILD_SETTID和前面的CLONE_PARENT_SETTID类似，如果设置了还要在子进程的用户空间的某个地址写入pid，这个地址也是和parent一致，由clone调用传入。但是这里的clear暂时还不清楚，这里也没有及时写入该值，为什么？

19、这里还有几个变量不明白什么意思

20、设置信号处理栈的地方，在vfork调用的时候是同时设置了CLONE_VFORK和CLONE_VM，这里要求只使用了CLONE_VM，还不明白有哪些情况。以及为什么。

21、TIF_SYSCALL_TRACE这个位干啥的现在也还不清楚，跟踪系统调用。

22、下面是初始化调度的部分。

23、recalc_sigpending用来查看当前是否有信号，如果有，则此时还不能将新创建的进程加入到进程组，fork发出之前的信号不应该被传送给新的进程。

24、后面实在看不下去了，字段都完全不明白，先留着，后面再做打算。

2.4 exit和exit_group系统调用退出进程

先看exit的实现，sys_exit定义在kernel\exit.c中

/*
 * exit() service routine: keeps the low 8 bits of error_code, shifts them
 * into bits 8-15 and hands the result to do_exit().
 */
asmlinkage long sys_exit(int error_code)
{
    int low_byte = error_code & 0xff;

    do_exit(low_byte << 8);
}

sys_exit

调用了do_exit函数来执行，实际上所有进程终止都是使用do_exit函数，包括exit_group的实现。

/*
 * Terminate the current task with exit code `code`. The task's resources
 * are released one by one, the task is marked PF_DEAD and schedule() is
 * called; control is not expected to come back (BUG() follows the call).
 */
fastcall NORET_TYPE void do_exit(long code)
{
    struct task_struct *tsk = current;
    struct taskstats *tidstats;
    int group_dead;
    unsigned int mycpu;

    profile_task_exit(tsk);

    WARN_ON(atomic_read(&tsk->fs_excl));

    /* Contexts that must never exit: interrupt handlers, the idle task (pid 0), init. */
    if (unlikely(in_interrupt()))
        panic("Aiee, killing interrupt handler!");
    if (unlikely(!tsk->pid))
        panic("Attempted to kill the idle task!");
    if (unlikely(tsk == child_reaper))
        panic("Attempted to kill init!");

    if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
        current->ptrace_message = code;
        ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
    }

    /*
     * We're taking recursive faults here in do_exit. Safest is to just
     * leave this task alone and wait for reboot.
     */
    if (unlikely(tsk->flags & PF_EXITING)) {
        printk(KERN_ALERT
            "Fixing recursive fault but reboot is needed!\n");
        if (tsk->io_context)
            exit_io_context();
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
    }

    tsk->flags |= PF_EXITING;

    if (unlikely(in_atomic()))
        printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
                current->comm, current->pid,
                preempt_count());

    taskstats_exit_alloc(&tidstats, &mycpu);

    acct_update_integrals(tsk);
    if (tsk->mm) {
        update_hiwater_rss(tsk->mm);
        update_hiwater_vm(tsk->mm);
    }
    /* group_dead is set when this decrement takes signal->live to zero. */
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
         hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk->signal);
    }
    acct_collect(code, group_dead);
    if (unlikely(tsk->robust_list))
        exit_robust_list(tsk);
#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
    if (unlikely(tsk->compat_robust_list))
        compat_exit_robust_list(tsk);
#endif
    if (unlikely(tsk->audit_context))
        audit_free(tsk);
    taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
    taskstats_exit_free(tidstats);

    /* Release the task's resources one subsystem at a time. */
    exit_mm(tsk);

    if (group_dead)
        acct_process();
    exit_sem(tsk);
    __exit_files(tsk);
    __exit_fs(tsk);
    exit_namespace(tsk);
    exit_thread();
    cpuset_exit(tsk);
    exit_keys(tsk);

    if (group_dead && tsk->signal->leader)
        disassociate_ctty(1);

    /* Drop the module references held on the exec domain and binary format. */
    module_put(task_thread_info(tsk)->exec_domain->module);
    if (tsk->binfmt)
        module_put(tsk->binfmt->module);

    tsk->exit_code = code;
    proc_exit_connector(tsk);
    exit_notify(tsk);
#ifdef CONFIG_NUMA
    mpol_free(tsk->mempolicy);
    tsk->mempolicy = NULL;
#endif
    /*
     * This must happen late, after the PID is not
     * hashed anymore:
     */
    if (unlikely(!list_empty(&tsk->pi_state_list)))
        exit_pi_state_list(tsk);
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
    /*
     * Make sure we are holding no locks:
     */
    debug_check_no_locks_held(tsk);

    if (tsk->io_context)
        exit_io_context();

    if (tsk->splice_pipe)
        __free_pipe_info(tsk->splice_pipe);

    /* PF_DEAD causes final put_task_struct after we schedule. */
    preempt_disable();
    BUG_ON(tsk->flags & PF_DEAD);
    tsk->flags |= PF_DEAD;

    schedule();
    BUG();
    /* Avoid "noreturn function does return".  */
    for (;;) ;
}

do_exit

秒客网

linux-2.6.18源码分析笔记---进程