Kernel启动流程源码解析 7 rest_init()

时间:2021-06-03 15:13:08

一 rest_init

1.0 rest_init

定义在init/main.c中
/*
 * rest_init - last step of boot on the calling (boot) context.
 *
 * Spawns the kernel_init and kthreadd kernel threads, records kthreadd's
 * task pointer in kthreadd_task, signals kthreadd_done, and finally enters
 * cpu_startup_entry(), which runs the idle loop.
 */
static noinline void __init_refok rest_init(void)
{
    int pid;
    const struct sched_param param = { .sched_priority = 1 };

    rcu_scheduler_starting(); // mark the RCU scheduler as active
    /*
     * We need to spawn init first so that it obtains pid 1, however
     * the init task will end up wanting to create kthreads, which, if
     * we schedule it before we create kthreadd, will OOPS.
     */
    kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); // spawn kernel_init (init, pid 1); it waits on kthreadd_done, so its real work starts only after kthreadd exists
    numa_default_policy(); // set the default memory policy on NUMA systems
    pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); // spawn kthreadd (pid 2), which creates the kernel threads requested via kthread_create
    rcu_read_lock();
    kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); // look up kthreadd's task_struct by pid (lookup requires rcu_read_lock)
    rcu_read_unlock();
    sched_setscheduler_nocheck(kthreadd_task, SCHED_FIFO, &param); // switch kthreadd to SCHED_FIFO with sched_priority 1
    complete(&kthreadd_done); // tell kernel_init that kthreadd has been created

    /*
     * The boot idle thread must execute schedule()
     * at least once to get things moving:
     */
    init_idle_bootup_task(current); // put the current task (pid 0) into the idle scheduling class
    schedule_preempt_disabled(); // voluntarily schedule; preemption is disabled again on return
    /* Call into cpu_idle with preempt disabled */
    cpu_startup_entry(CPUHP_ONLINE); // pid 0 has finished kernel initialisation and becomes the idle task
}


1.1  rcu_scheduler_starting

定义在kernel/rcutree.c中
/*
 * rcu_scheduler_starting - flag RCU that the scheduler is coming up.
 *
 * Warns if more than one CPU is online or if any context switch has
 * already happened, then sets rcu_scheduler_active.
 */
void rcu_scheduler_starting(void)
{
    WARN_ON(num_online_cpus() != 1); // expect exactly one CPU online at this point
    WARN_ON(nr_context_switches() > 0); // expect no context switch to have happened yet
    rcu_scheduler_active = 1; // mark the RCU scheduler as active
}


1.2 kernel_thread

定义在kernel/fork.c中
/*
 * kernel_thread - create a kernel thread that runs fn(arg).
 *
 * @fn:    thread entry point.
 * @arg:   opaque argument handed to @fn.
 * @flags: clone flags; CLONE_VM and CLONE_UNTRACED are always OR-ed in.
 *
 * Returns whatever do_fork() returns (stored by rest_init() as the new pid).
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
    /* fn and arg are passed through do_fork()'s unsigned long parameters. */
    return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
        (unsigned long)arg, NULL, NULL);
}

1.3 kthreadd

定义在kernel/kthread.c中
/*
 * kthreadd - main loop of the kernel-thread creation daemon.
 *
 * Names the current task "kthreadd", prepares the context its children
 * inherit, then loops forever servicing kthread_create_list: each queued
 * kthread_create_info is unlinked under kthread_create_lock and handed to
 * create_kthread() with the lock dropped.
 *
 * @unused: not referenced by the body.
 *
 * The loop has no exit path, so the trailing "return 0" is never reached.
 */
int kthreadd(void *unused)
{
    struct task_struct *tsk = current;

    /* Setup a clean context for our children to inherit. */
    set_task_comm(tsk, "kthreadd");
    ignore_signals(tsk);
    set_cpus_allowed_ptr(tsk, cpu_all_mask);
    set_mems_allowed(node_states[N_MEMORY]);

    current->flags |= PF_NOFREEZE;

    for (;;) {
        /*
         * The task state is changed before the emptiness check;
         * presumably so a wake-up arriving between the check and
         * schedule() is not lost -- confirm against the waker.
         */
        set_current_state(TASK_INTERRUPTIBLE);
        if (list_empty(&kthread_create_list))
            schedule();
        __set_current_state(TASK_RUNNING);

        spin_lock(&kthread_create_lock);
        while (!list_empty(&kthread_create_list)) {
            struct kthread_create_info *create;

            /* Take the first pending request off the list. */
            create = list_entry(kthread_create_list.next,
                        struct kthread_create_info, list);
            list_del_init(&create->list);
            /* The lock is released while the thread is being created. */
            spin_unlock(&kthread_create_lock);

            create_kthread(create);

            /* Re-take the lock before looking at the list again. */
            spin_lock(&kthread_create_lock);
        }
        spin_unlock(&kthread_create_lock);
    }

    return 0;
}


1.4 rcu_read_lock & rcu_read_unlock

定义在include/linux/rcupdate.h中
/*
 * rcu_read_lock - enter an RCU read-side critical section.
 *
 * Calls __rcu_read_lock() and then runs the checking hooks; the final
 * assertion reports use of rcu_read_lock() while the CPU is idle.
 */
static inline void rcu_read_lock(void)
{
    __rcu_read_lock();
    __acquire(RCU); /* NOTE(review): presumably a static-checker annotation -- confirm */
    rcu_lock_acquire(&rcu_lock_map); /* NOTE(review): presumably lockdep bookkeeping -- confirm */
    rcu_lockdep_assert(!rcu_is_cpu_idle(),
               "rcu_read_lock() used illegally while idle");
}

/*
 * rcu_read_unlock - leave an RCU read-side critical section.
 *
 * Performs the same steps as rcu_read_lock() in reverse order: idle-CPU
 * assertion first, then the release hooks, then __rcu_read_unlock().
 */
static inline void rcu_read_unlock(void)
{
    rcu_lockdep_assert(!rcu_is_cpu_idle(),
               "rcu_read_unlock() used illegally while idle");
    rcu_lock_release(&rcu_lock_map);
    __release(RCU);
    __rcu_read_unlock();
}

1.5 find_task_by_pid_ns

定义在kernel/pid.c中
/*
 * find_task_by_pid_ns - look up a task by pid number within a namespace.
 *
 * @nr: pid number to look up.
 * @ns: pid namespace in which @nr is interpreted.
 *
 * Asserts that the caller holds rcu_read_lock(), then returns the
 * PIDTYPE_PID task attached to the pid found by find_pid_ns().
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
    rcu_lockdep_assert(rcu_read_lock_held(),
               "find_task_by_pid_ns() needs rcu_read_lock()"
               " protection");
    return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

/*
 * init_pid_ns - statically initialised pid namespace; rest_init() passes it
 * to find_task_by_pid_ns() when looking up kthreadd.
 */
struct pid_namespace init_pid_ns = {
    .kref = {
        .refcount       = ATOMIC_INIT(2),
    },
    .pidmap = {
        /* Range designator: every pidmap entry gets the same initial value. */
        [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
    },
    .last_pid = 0,
    .level = 0,
    .child_reaper = &init_task,
    .user_ns = &init_user_ns,
    .proc_inum = PROC_PID_INIT_INO,
};

1.6 sched_setscheduler_nocheck

定义在kernel/sched/core.c中
/*
 * sched_setscheduler_nocheck - change a task's scheduling policy/parameters.
 *
 * @p:      task to modify.
 * @policy: new scheduling policy (rest_init() passes SCHED_FIFO).
 * @param:  new scheduling parameters.
 *
 * Thin wrapper: forwards to __sched_setscheduler() with its last argument
 * set to false, and returns that call's result.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                   const struct sched_param *param)
{
    return __sched_setscheduler(p, policy, param, false);
}

1.7 kthreadd_done

定义在init/main.c中
/*
 * Completion signalled by rest_init() once kthreadd has been created and
 * waited on by kernel_init_freeable().
 */
static __initdata DECLARE_COMPLETION(kthreadd_done);

/* Define a struct completion named @work, initialised with COMPLETION_INITIALIZER. */
#define DECLARE_COMPLETION(work) \
    struct completion work = COMPLETION_INITIALIZER(work)



1.8 init_idle_bootup_task

定义在kernel/sched/core.c中
/*
 * init_idle_bootup_task - place a task in the idle scheduling class.
 *
 * @idle: task whose sched_class is pointed at idle_sched_class.
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
    idle->sched_class = &idle_sched_class;
}

#define get_current() (current_thread_info()->task)
#define current get_current()

1.9 schedule_preempt_disabled

/*
 * schedule_preempt_disabled - call schedule() from a preempt-disabled context.
 *
 * Drops one preempt-count level without triggering a reschedule, calls
 * schedule(), and re-disables preemption before returning.
 */
void __sched schedule_preempt_disabled(void)
{
    sched_preempt_enable_no_resched(); // decrement preempt_count without rescheduling immediately
    schedule(); // voluntarily give up the CPU; during boot this lets kernel_init (pid 1) run
    preempt_disable(); // disable preemption again
}

/* Compiler barrier first, then decrement the preempt count; no reschedule is triggered here. */
#define sched_preempt_enable_no_resched() \
do { \
    barrier(); \
    dec_preempt_count(); \
} while (0)

/* Increment the preempt count, then issue a compiler barrier. */
#define preempt_disable() \
do { \
    inc_preempt_count(); \
    barrier(); \
} while (0)

1.10 cpu_startup_entry

定义在kernel/cpu/idle.c中
/*
 * cpu_startup_entry - entry point into the idle loop.
 *
 * @state: hotplug state supplied by the caller (rest_init() passes
 *         CPUHP_ONLINE); not referenced by the body shown here.
 *
 * On x86 the stack canary is (re)initialised first. The function then marks
 * the task as polling, runs the architecture's idle preparation and enters
 * cpu_idle_loop().
 */
void cpu_startup_entry(enum cpuhp_state state)
{
    /*
     * This #ifdef needs to die, but it's too late in the cycle to
     * make this generic (arm and sh have never invoked the canary
     * init for the non boot cpus!). Will be fixed in 3.11
     */
#ifdef CONFIG_X86
    /*
     * If we're the non-boot CPU, nothing set the stack canary up
     * for us. The boot CPU already has it initialized but no harm
     * in doing it again. This is a good place for updating it, as
     * we wont ever return from this function (so the invalid
     * canaries already on the stack wont ever trigger).
     */
    boot_init_stack_canary();
#endif
    __current_set_polling();
    arch_cpu_idle_prepare();
    cpu_idle_loop(); // pid 0 enters the idle loop
}


二 kernel_init

2.0 kernel_init

定义在init/main.c中
/*
 * kernel_init - body of the thread spawned first by rest_init().
 *
 * Runs kernel_init_freeable(), frees the init memory, marks the system as
 * running, and then tries to exec a user-space init program in this order:
 * ramdisk_execute_command, execute_command, "/sbin/init", "/etc/init",
 * "/bin/init", "/bin/sh".
 *
 * @unused: not referenced by the body.
 *
 * Returns 0 as soon as one run_init_process() call returns 0; calls panic()
 * if every candidate fails.
 */
static int __ref kernel_init(void *unused)
{
   kernel_init_freeable();  // important; described in detail in section 2.1
    /* need to finish all async __init code before freeing the memory */
    async_synchronize_full(); // wait for all asynchronous calls to finish
    free_initmem(); // free the memory held by the init.* sections
    mark_rodata_ro(); // empty on arm64 (per the article's author)
    system_state = SYSTEM_RUNNING; // the system is now in the running state
    numa_default_policy();

    flush_delayed_fput(); // flush all delayed fput work

    if (ramdisk_execute_command) {
        if (!run_init_process(ramdisk_execute_command)) // effectively do_execve("/init"): the kernel thread turns into a user process
            return 0;
        pr_err("Failed to execute %s\n", ramdisk_execute_command);
    }

    /*
     * We try each of these until one succeeds.
     *
     * The Bourne shell can be used instead of init if we are
     * trying to recover a really broken machine.
     */
    if (execute_command) {
        if (!run_init_process(execute_command))
            return 0;
        pr_err("Failed to execute %s.  Attempting defaults...\n",
            execute_command);
    }
    /* Short-circuit evaluation: stop at the first candidate that execs. */
    if (!run_init_process("/sbin/init") ||
        !run_init_process("/etc/init") ||
        !run_init_process("/bin/init") ||
        !run_init_process("/bin/sh"))
        return 0;

    panic("No init found.  Try passing init= option to kernel. "
          "See Linux Documentation/init.txt for guidance.");
}


2.1 kernel_init_freeable

定义在init/main.c中
/*
 * kernel_init_freeable - the part of init's start-up that lives in __init
 * memory and can therefore be freed afterwards.
 *
 * Waits for kthreadd, lifts the boot-time allocation/affinity restrictions
 * on the init task, brings up SMP, runs do_basic_setup(), opens
 * /dev/console as fds 0/1/2, and decides whether an early user-space
 * "/init" exists; if not, prepare_namespace() is called instead.
 */
static noinline void __init kernel_init_freeable(void)
{
    /*
     * Wait until kthreadd is all set-up.
     */
    wait_for_completion(&kthreadd_done); // block until rest_init() signals that kthreadd has been created

    /* Now the scheduler is fully set up and can do blocking allocations */
    gfp_allowed_mask = __GFP_BITS_MASK;

    /*
     * init can allocate pages on any node
     */
    set_mems_allowed(node_states[N_MEMORY]); // nodes from which init may allocate pages
    /*
     * init can run on any cpu.
     */
    set_cpus_allowed_ptr(current, cpu_all_mask); // allow init to run on every CPU

    cad_pid = task_pid(current); // cad = ctrl-alt-del: init is the task that handles the ctrl-alt-del signal

    smp_prepare_cpus(setup_max_cpus); // call cpu_prepare for all usable CPUs and mark them present

    do_pre_smp_initcalls(); // run the initcalls registered below level 0 (early initcalls)
    lockup_detector_init(); // enable the watchdog

    smp_init(); // bring up the CPUs other than cpu0
    sched_init_smp(); // initialise the scheduler domains

    do_basic_setup(); // important; described in detail below

    /* Open the /dev/console on the rootfs, this should never fail */
    if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) // becomes fd 0, init's standard input
        pr_err("Warning: unable to open an initial console.\n");

    (void) sys_dup(0); // duplicate fd 0 -> fd 1, standard output
    (void) sys_dup(0); // duplicate fd 0 -> fd 2, standard error
    /*
     * check if there is an early userspace init.  If yes, let it do all
     * the work
     */

    if (!ramdisk_execute_command)
        ramdisk_execute_command = "/init"; // default early user-space init program

    if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
        /* No early init available: clear the command and prepare the namespace instead. */
        ramdisk_execute_command = NULL;
        prepare_namespace();
    }
    /*
     * Ok, we have completed the initial bootup, and
     * we're essentially up and running. Get rid of the
     * initmem segments and start the user-mode stuff..
     */

    /* rootfs is available now, try loading default modules */
    load_default_modules(); // loads the I/O scheduler (elevator) module, per the article's author
}

/*
 * do_basic_setup - bring up the core subsystems.
 *
 * Called from kernel_init_freeable() after smp_init()/sched_init_smp();
 * initialises the driver model and runs the regular initcalls.
 */
static void __init do_basic_setup(void)
{
    cpuset_init_smp(); // initialise the cpuset control-group subsystem
    usermodehelper_init(); // create the khelper workqueue used to spawn user-space helper programs
    shmem_init(); // initialise shared memory
    driver_init(); // initialise the device/driver model
    init_irq_proc(); // create /proc/irq and a sub-directory for every interrupt
    do_ctors(); // run the kernel's constructors
    usermodehelper_enable(); // enable usermodehelper
    /*
     * Run initcall levels 0..7, whose level names are "early", "core",
     * "postcore", "arch", "subsys", "fs", "device", "late". The kernel's
     * naming is slightly misleading here: early_initcall is below level 0,
     * and the real level 0 is pure_initcall.
     */
    do_initcalls();
    random_int_secret_init(); // initialise the random-number secret pool
}

/*
 * driver_init - initialise the driver-model core.
 *
 * Called from do_basic_setup(). The core pieces come first; the platform
 * bus, cpu, memory and container pieces must follow them.
 */
void __init driver_init(void)
{
    /* These are the core pieces */
    devtmpfs_init(); // register the devtmpfs filesystem and start the kdevtmpfs thread
    devices_init(); // kset "devices"; kobjects "dev", "dev/block", "dev/char"
    buses_init(); // bus subsystem: ksets "bus" and "devices/system"
    classes_init(); // class subsystem: kset "class"
    firmware_init(); // firmware subsystem: kobject "firmware"

    hypervisor_init(); // hypervisor subsystem: kobject "hypervisor"

    /* These are also core pieces, but must come after the
     * core core pieces.
     */
    platform_bus_init(); // "bus/platform" subsystem
    cpu_dev_init(); // "devices/system/cpu" subsystem
    memory_dev_init();  // currently an empty function (per the article's author)
    container_dev_init(); // "devices/system/container" subsystem
}


2.2 free_initmem

定义在arch/arm64/mm/init.c中
/*
 * free_initmem - release the init memory region (arm64 version).
 *
 * Poisons the range [__init_begin, __init_end) and then calls
 * free_initmem_default(0).
 */
void free_initmem(void)
{
    poison_init_mem(__init_begin, __init_end - __init_begin);
    free_initmem_default(0);
}

2.3 run_init_process

定义在init/main.c中
/*
 * run_init_process - try to exec a user-space init program.
 *
 * @init_filename: path of the program; also stored as argv_init[0].
 *
 * Returns do_execve()'s result; kernel_init() treats 0 as success.
 */
static int run_init_process(const char *init_filename)
{
    argv_init[0] = init_filename;
    return do_execve(init_filename,
        (const char __user *const __user *)argv_init,
        (const char __user *const __user *)envp_init);
}

/* Argument vector for init; slot 0 is overwritten by run_init_process(). */
static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
/* Environment handed to init: HOME and TERM defaults, NULL-terminated. */
const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };