Kernel启动流程源码解析 7 rest_init()

一 rest_init

1.0 rest_init

定义在init/main.c中

/*
 * rest_init - last step of boot-time initialisation on the boot context.
 *
 * Spawns the kernel_init thread (which becomes pid 1) and the kthreadd
 * thread, records kthreadd's task_struct in kthreadd_task, signals
 * kthreadd_done so kernel_init may proceed, and finally turns the calling
 * context (pid 0) into the idle task via cpu_startup_entry().
 */
static noinline void __init_refok rest_init(void)
{
	int pid;
	const struct sched_param param = { .sched_priority = 1 };

	rcu_scheduler_starting(); /* enable RCU */
	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	/* Create the kernel_init thread (init, pid 1); it runs only after kthreadd. */
	kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
	numa_default_policy(); /* set the default NUMA memory access policy */
	/*
	 * Create the kthreadd thread (pid 2), which manages and schedules the
	 * other kernel threads, i.e. those created through kthread_create.
	 */
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
	rcu_read_lock();
	/* Look up kthreadd's task descriptor from the pid just returned. */
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
	rcu_read_unlock();
	/* Give kthreadd the SCHED_FIFO policy with sched_priority 1. */
	sched_setscheduler_nocheck(kthreadd_task, SCHED_FIFO, &param);
	/* Tell kernel_init that kthreadd has been created. */
	complete(&kthreadd_done);

	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	init_idle_bootup_task(current); /* put the current task (pid 0) in the idle class */
	schedule_preempt_disabled(); /* invoke the scheduler, returning with preemption disabled */
	/* Call into cpu_idle with preempt disabled */
	/* pid 0 has finished kernel initialisation and enters the idle loop. */
	cpu_startup_entry(CPUHP_ONLINE);
}

1.1 rcu_scheduler_starting

定义在kernel/rcutree.c中

   /*
    * rcu_scheduler_starting - flag the RCU scheduler as active.
    *
    * Emits a warning if more than one CPU is online or if any context
    * switch has already been counted, then sets rcu_scheduler_active to 1.
    */
   void rcu_scheduler_starting(void)
   {
       WARN_ON(num_online_cpus() != 1); // expect exactly one CPU to be online
       WARN_ON(nr_context_switches() > 0); // expect no context switch to have happened yet
       rcu_scheduler_active = 1; // enable the RCU mechanism
   }

1.2 kernel_thread

定义在kernel/fork.c

   /*
    * kernel_thread - start a new thread through do_fork().
    * @fn:    entry function, handed to do_fork() cast to unsigned long
    * @arg:   argument for @fn, handed to do_fork() cast to unsigned long
    * @flags: caller's clone flags; CLONE_VM and CLONE_UNTRACED are always
    *         OR-ed in
    *
    * Returns whatever do_fork() returns.
    */
   pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
   {
       const unsigned long clone_flags = flags | CLONE_VM | CLONE_UNTRACED;

       return do_fork(clone_flags, (unsigned long)fn, (unsigned long)arg,
                      NULL, NULL);
   }

1.3 kthreadd

定义在kernel/kthread.c中

   /*
    * kthreadd - thread function that services kthread creation requests.
    * @unused: not referenced by the function
    *
    * After setting up its own context it loops forever: it sleeps while
    * kthread_create_list is empty, and otherwise removes each queued
    * kthread_create_info and hands it to create_kthread(). The loop has
    * no exit, so the trailing return is never reached.
    */
   int kthreadd(void *unused)
   {
       struct task_struct *tsk = current;
       /* Setup a clean context for our children to inherit. */
       set_task_comm(tsk, "kthreadd");
       ignore_signals(tsk);
       set_cpus_allowed_ptr(tsk, cpu_all_mask);
       set_mems_allowed(node_states[N_MEMORY]);
       current->flags |= PF_NOFREEZE;
       for (;;) {
           /* Mark ourselves sleeping before checking the list, then only
            * call schedule() if there is nothing to do. */
           set_current_state(TASK_INTERRUPTIBLE);
           if (list_empty(&kthread_create_list))
               schedule();
           __set_current_state(TASK_RUNNING);
           spin_lock(&kthread_create_lock);
           while (!list_empty(&kthread_create_list)) {
               struct kthread_create_info *create;
               /* Take the first pending request off the list. */
               create = list_entry(kthread_create_list.next,
                           struct kthread_create_info, list);
               list_del_init(&create->list);
               /* The lock is dropped around create_kthread() and
                * re-taken before the list is examined again. */
               spin_unlock(&kthread_create_lock);
               create_kthread(create);
               spin_lock(&kthread_create_lock);
           }
           spin_unlock(&kthread_create_lock);
       }
       return 0;
   }

1.4 rcu_read_lock & rcu_read_unlock

定义在include/linux/rcupdate.h中

   /*
    * rcu_read_lock - enter an RCU read-side section.
    *
    * Calls __rcu_read_lock(), records the acquisition through __acquire()
    * and rcu_lock_acquire(), then asserts that the CPU is not considered
    * idle by RCU.
    */
   static inline void rcu_read_lock(void)
   {
       __rcu_read_lock();
       __acquire(RCU);
       rcu_lock_acquire(&rcu_lock_map);
       rcu_lockdep_assert(!rcu_is_cpu_idle(),
                  "rcu_read_lock() used illegally while idle");
   }
 
   /*
    * rcu_read_unlock - leave an RCU read-side section.
    *
    * Performs the steps of rcu_read_lock() in reverse: the idle assertion
    * first, then rcu_lock_release(), __release() and __rcu_read_unlock().
    */
   static inline void rcu_read_unlock(void)
   {
       rcu_lockdep_assert(!rcu_is_cpu_idle(),
                  "rcu_read_unlock() used illegally while idle");
       rcu_lock_release(&rcu_lock_map);
       __release(RCU);
       __rcu_read_unlock();
   }

1.5 find_task_by_pid_ns

定义在kernel/pid.c中

   /*
    * find_task_by_pid_ns - look up a task by pid number inside a namespace.
    * @nr: pid number to look up
    * @ns: pid namespace in which @nr is resolved
    *
    * Asserts that the caller holds rcu_read_lock(), then returns the
    * result of pid_task() applied to find_pid_ns(nr, ns) with PIDTYPE_PID.
    */
   struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
   {
       rcu_lockdep_assert(rcu_read_lock_held(),
                  "find_task_by_pid_ns() needs rcu_read_lock()"
                  " protection");
       return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
   }
 
   /*
    * init_pid_ns - statically initialised pid namespace that rest_init()
    * passes to find_task_by_pid_ns().
    */
   struct pid_namespace init_pid_ns = {
       .kref = {
           .refcount       = ATOMIC_INIT(2),
       },
       /* Every pidmap entry starts with BITS_PER_PAGE and a NULL pointer. */
       .pidmap = {
           [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
       },
       .last_pid = 0,
       .level = 0,
       .child_reaper = &init_task,
       .user_ns = &init_user_ns,
       .proc_inum = PROC_PID_INIT_INO,
   };

1.6 sched_setscheduler_nocheck

定义在kernel/sched/core.c中

   /*
    * sched_setscheduler_nocheck - set a task's scheduling policy and
    * parameters by forwarding to __sched_setscheduler() with its final
    * argument fixed to false.
    * @p:      task to modify
    * @policy: new scheduling policy
    * @param:  new scheduling parameters
    *
    * Returns the value produced by __sched_setscheduler().
    */
   int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                      const struct sched_param *param)
   {
       int ret;

       ret = __sched_setscheduler(p, policy, param, false);
       return ret;
   }

1.7 kthreadd_done

定义在init/main.c中

   /*
    * Completion that rest_init() signals with complete() and that
    * kernel_init_freeable() waits on with wait_for_completion().
    */
   static __initdata DECLARE_COMPLETION(kthreadd_done);

   /*
    * DECLARE_COMPLETION - define a struct completion named @work,
    * initialised with COMPLETION_INITIALIZER(work).
    */
   #define DECLARE_COMPLETION(work) \
       struct completion work = COMPLETION_INITIALIZER(work)

1.8 init_idle_bootup_task

定义在kernel/sched/core.c中

   /*
    * init_idle_bootup_task - switch a task to the idle scheduling class.
    * @idle: task whose sched_class is pointed at idle_sched_class
    */
   void __cpuinit init_idle_bootup_task(struct task_struct *idle)
   {
       idle->sched_class = &idle_sched_class;
   }
 
   /* get_current() yields the task pointer stored in the current thread_info. */
   #define get_current() (current_thread_info()->task)
   /* current is shorthand for get_current(). */
   #define current get_current()

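1.9 schedule_preempt_disabled

定义在kernel/sched/core.c中（sched_preempt_enable_no_resched 和 preempt_disable 宏定义在include/linux/preempt.h中）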

   /*
    * schedule_preempt_disabled - call schedule() from a context that has
    * preemption disabled, and return with preemption disabled again.
    */
   void __sched schedule_preempt_disabled(void)
   {
       sched_preempt_enable_no_resched(); // decrement preempt_count without triggering an immediate reschedule
       schedule(); // voluntarily yield the CPU; during boot this is where pid 1 (kernel_init) gets to run
       preempt_disable(); // disable preemption again
   }
 
   /*
    * sched_preempt_enable_no_resched - decrement the preempt count after a
    * compiler barrier; no reschedule is requested here.
    */
   #define sched_preempt_enable_no_resched() \
   do { \
       barrier(); \
       dec_preempt_count(); \
   } while (0)

   /*
    * preempt_disable - increment the preempt count, followed by a compiler
    * barrier.
    */
   #define preempt_disable() \
   do { \
       inc_preempt_count(); \
       barrier(); \
   } while (0)

1.10 cpu_startup_entry

定义在kernel/cpu/idle.c中

   /*
    * cpu_startup_entry - enter the idle loop on the calling CPU.
    * @state: not referenced in the body shown here
    *
    * On x86 builds it first (re)initialises the stack canary, then marks
    * the task as polling, runs the architecture's idle preparation hook
    * and calls cpu_idle_loop(). Per the comment below, this function is
    * not expected to return.
    */
   void cpu_startup_entry(enum cpuhp_state state)
   {
       /*
        * This #ifdef needs to die, but it's too late in the cycle to
        * make this generic (arm and sh have never invoked the canary
        * init for the non boot cpus!). Will be fixed in 3.11
        */
   #ifdef CONFIG_X86
       /*
        * If we're the non-boot CPU, nothing set the stack canary up
        * for us. The boot CPU already has it initialized but no harm
        * in doing it again. This is a good place for updating it, as
        * we wont ever return from this function (so the invalid
        * canaries already on the stack wont ever trigger).
        */
       boot_init_stack_canary();
   #endif
       __current_set_polling();
       arch_cpu_idle_prepare();
       cpu_idle_loop(); // pid 0 enters the idle loop
   }

二 kernel_init

2.0 kernel_init

定义在init/main.c中

/*
 * kernel_init - thread function of the init task (pid 1).
 * @unused: not referenced
 *
 * Finishes the freeable part of initialisation, releases init memory,
 * marks the system as running and then tries to execute an init program:
 * ramdisk_execute_command first, then execute_command, then a fixed list
 * of default paths. Returns 0 as soon as one run_init_process() call
 * succeeds; panics if none does.
 */
static int __ref kernel_init(void *unused)
{
	kernel_init_freeable(); /* the bulk of late initialisation happens here */
	/* need to finish all async __init code before freeing the memory */
	async_synchronize_full(); /* wait for all asynchronous calls to complete */
	free_initmem(); /* release the memory held by the init.* sections */
	mark_rodata_ro(); /* empty on arm64 */
	system_state = SYSTEM_RUNNING; /* the system is now in the running state */
	numa_default_policy();

	flush_delayed_fput(); /* flush all delayed fput() work */

	if (ramdisk_execute_command) {
		/*
		 * do_execve("/init"): run the init program, turning this
		 * kernel thread into a user process.
		 */
		if (!run_init_process(ramdisk_execute_command))
			return 0;
		pr_err("Failed to execute %s\n", ramdisk_execute_command);
	}

	/*
	 * We try each of these until one succeeds.
	 *
	 * The Bourne shell can be used instead of init if we are
	 * trying to recover a really broken machine.
	 */
	if (execute_command) {
		if (!run_init_process(execute_command))
			return 0;
		pr_err("Failed to execute %s. Attempting defaults...\n",
			execute_command);
	}
	if (!run_init_process("/sbin/init") ||
	    !run_init_process("/etc/init") ||
	    !run_init_process("/bin/init") ||
	    !run_init_process("/bin/sh"))
		return 0;

	panic("No init found. Try passing init= option to kernel. "
	      "See Linux Documentation/init.txt for guidance.");
}

2.1 kernel_init_freeable

定义在init/main.c中

/*
 * kernel_init_freeable - the part of init-task setup that lives in __init
 * memory and is called from kernel_init() before free_initmem().
 *
 * Waits for kthreadd, brings up SMP, runs the initcalls through
 * do_basic_setup(), opens the console on fds 0-2 and decides whether an
 * early userspace init (ramdisk_execute_command) is available.
 */
static noinline void __init kernel_init_freeable(void)
{
	/*
	 * Wait until kthreadd is all set-up.
	 */
	wait_for_completion(&kthreadd_done); /* signalled by rest_init() once kthreadd exists */

	/* Now the scheduler is fully set up and can do blocking allocations */
	gfp_allowed_mask = __GFP_BITS_MASK;

	/*
	 * init can allocate pages on any node
	 */
	set_mems_allowed(node_states[N_MEMORY]);
	/*
	 * init can run on any cpu.
	 */
	set_cpus_allowed_ptr(current, cpu_all_mask);

	/* cad = ctrl-alt-del: the init task is the one that handles it. */
	cad_pid = task_pid(current);

	smp_prepare_cpus(setup_max_cpus); /* prepare every usable CPU and mark it present */

	do_pre_smp_initcalls(); /* run the initcalls whose level is below 0 */
	lockup_detector_init(); /* enable the watchdog */

	smp_init(); /* bring up the CPUs other than cpu0 */
	sched_init_smp(); /* initialise the scheduling domains */

	do_basic_setup(); /* driver model setup and initcalls */

	/* Open the /dev/console on the rootfs, this should never fail */
	/* This becomes fd 0, the init task's standard input. */
	if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
		pr_err("Warning: unable to open an initial console.\n");

	(void) sys_dup(0); /* duplicate fd 0: standard output */
	(void) sys_dup(0); /* duplicate fd 0 again: standard error */
	/*
	 * check if there is an early userspace init. If yes, let it do all
	 * the work
	 */

	if (!ramdisk_execute_command)
		ramdisk_execute_command = "/init"; /* default init program */

	if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
		ramdisk_execute_command = NULL;
		prepare_namespace();
	}

	/*
	 * Ok, we have completed the initial bootup, and
	 * we're essentially up and running. Get rid of the
	 * initmem segments and start the user-mode stuff..
	 */

	/* rootfs is available now, try loading default modules */
	load_default_modules(); /* loads the I/O scheduler (elevator) module */
}

/*
 * do_basic_setup - initialise the core subsystems, the driver model and
 * then run the initcalls. Called from kernel_init_freeable().
 */
static void __init do_basic_setup(void)
{
	cpuset_init_smp(); /* initialise the cpuset subsystem of control groups */
	/*
	 * Create the single-threaded khelper workqueue used to help spawn and
	 * run user-space programs.
	 */
	usermodehelper_init();
	shmem_init(); /* initialise shared memory */
	driver_init(); /* initialise the device driver model */
	init_irq_proc(); /* create /proc/irq and a subdirectory for every interrupt */
	do_ctors(); /* run the kernel's constructors */
	usermodehelper_enable(); /* enable usermodehelper */
	/*
	 * Run the level 0 to level 7 initcalls. The level names, in order, are
	 * "early", "core", "postcore", "arch", "subsys", "fs", "device" and
	 * "late". The naming is slightly misleading: early_initcall actually
	 * corresponds to a level below 0, and it is pure_initcall that is
	 * level 0.
	 */
	do_initcalls();
	random_int_secret_init(); /* initialise the random number pool */
}

/*
 * driver_init - initialise the driver model subsystems in dependency
 * order. Called from do_basic_setup().
 */
void __init driver_init(void)
{
	/* These are the core pieces */
	devtmpfs_init(); /* register the devtmpfs filesystem and start kdevtmpfs */
	devices_init(); /* kset "devices"; kobjects "dev", "dev/block", "dev/char" */
	buses_init(); /* bus subsystem: ksets "bus" and "devices/system" */
	classes_init(); /* class subsystem: kset "class" */
	firmware_init(); /* firmware subsystem: kobject "firmware" */
	hypervisor_init(); /* hypervisor subsystem: kobject "hypervisor" */

	/* These are also core pieces, but must come after the
	 * core core pieces.
	 */
	platform_bus_init(); /* bus/platform subsystem */
	cpu_dev_init(); /* devices/system/cpu subsystem */
	memory_dev_init(); /* currently an empty function */
	container_dev_init(); /* devices/system/container subsystem */
}

2.2 free_initmem

定义在arch/arm64/mm/init.c中

   /*
    * free_initmem - release the init memory region.
    *
    * Poisons the range from __init_begin to __init_end, then calls
    * free_initmem_default(0).
    */
   void free_initmem(void)
   {
       poison_init_mem(__init_begin, __init_end - __init_begin);
       free_initmem_default(0);
   }

2.3 run_init_process

定义在init/main.c中

   /*
    * run_init_process - try to execute an init program.
    * @init_filename: path of the program; also stored as argv_init[0]
    *
    * Returns the result of do_execve() called with argv_init and
    * envp_init. Callers in kernel_init() treat a zero return as success.
    */
   static int run_init_process(const char *init_filename)
   {
       argv_init[0] = init_filename;
       return do_execve(init_filename,
           (const char __user *const __user *)argv_init,
           (const char __user *const __user *)envp_init);
   }
 
   /* Argument vector handed to do_execve(); slot 0 is overwritten by run_init_process(). */
   static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
   /* Environment handed to do_execve() by run_init_process(). */
   const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };

秒客网

Kernel启动流程源码解析 7 rest_init()

相关文章