Kdump之kexec源码分析

时间:2022-05-01 16:50:53
知道kexec还是在linuxsir上看到一篇介绍其应用的帖子, 经常用kexec快速启动内核的步骤如下:
(1).kexec -l <kernel-image> --append="<command-line-options>" [--initrd=xxxxxxxxxxx一般是要的,不过某些情况下可选]
例如: kexec -l /boot/vmlinuz-2.6.31  --append="root=/dev/sda6 ro nomce vga=0x317" --initrd=xxxxxxxxxxx
(2).kexec -e
再或者用kexec -p

下面仅分析 kexec -l 和 kexec -e 这两种形式


int main(int argc, char *argv[])
{
    int do_load = 1;
    int do_exec = 0;
    int do_load_jump_back_helper = 0;
    int do_shutdown = 1;
    int do_sync = 1;
    int do_ifdown = 0;
    int do_unload = 0;
    int do_reuse_initrd = 0;
    void *entry = 0;
    char *type = 0;
    char *endptr;
    int opt;
    int result = 0;
    int fileind;
    static const struct option options[] = {
        KEXEC_ARCH_OPTIONS
        { 0, 0, 0, 0},
    };
    static const char short_options[] = KEXEC_OPT_STR;

    opterr = 0; /* Don't complain about unrecognized options here */
    while ((opt = getopt_long(argc, argv, short_options,
                 options, 0)) != -1) {
        switch(opt) {
        case OPT_HELP:
            usage();
            return 0;
        case OPT_VERSION:
            version();
            return 0;
        case OPT_NOIFDOWN:
            do_ifdown = 0;
            break;
        case OPT_FORCE:
            do_load = 1;
            do_shutdown = 0;
            do_sync = 1;
            do_ifdown = 1;
            do_exec = 1;
            break;
        case OPT_LOAD://here1:

            do_load = 1;
            do_exec = 0;
            do_shutdown = 0;
            break;
        case OPT_UNLOAD:
            do_load = 0;
            do_shutdown = 0;
            do_sync = 0;
            do_unload = 1;
            break;
        case OPT_EXEC://here3:

            do_load = 0;
            do_shutdown = 0;
            do_sync = 1;
            do_ifdown = 1;
            do_exec = 1;
            break;
        case OPT_LOAD_JUMP_BACK_HELPER:
            do_load = 0;
            do_shutdown = 0;
            do_sync = 1;
            do_ifdown = 1;
            do_exec = 0;
            do_load_jump_back_helper = 1;
            kexec_flags = KEXEC_PRESERVE_CONTEXT;
            break;
        case OPT_ENTRY:
            entry = (void *)strtoul(optarg, &endptr, 0);
            if (*endptr) {
                fprintf(stderr,
                    "Bad option value in --load-jump-back-helper=%s\n",
                    optarg);
                usage();
                return 1;
            }
            break;
        case OPT_LOAD_PRESERVE_CONTEXT:
            do_load = 1;
            do_exec = 0;
            do_shutdown = 0;
            do_sync = 1;
            kexec_flags = KEXEC_PRESERVE_CONTEXT;
            break;
        case OPT_TYPE:
            type = optarg;
            break;
        case OPT_PANIC://here2:

            do_load = 1;
            do_exec = 0;
            do_shutdown = 0;
            do_sync = 0;
            kexec_flags = KEXEC_ON_CRASH;
            break;
        case OPT_MEM_MIN:
            mem_min = strtoul(optarg, &endptr, 0);
            if (*endptr) {
                fprintf(stderr,
                    "Bad option value in --mem-min=%s\n",
                    optarg);
                usage();
                return 1;
            }
            break;
        case OPT_MEM_MAX:
            mem_max = strtoul(optarg, &endptr, 0);
            if (*endptr) {
                fprintf(stderr,
                    "Bad option value in --mem-max=%s\n",
                    optarg);
                usage();
                return 1;
            }
            break;
        case OPT_REUSE_INITRD:
            do_reuse_initrd = 1;
            break;
        default:
            break;
        }
    }

    if ((kexec_flags & KEXEC_ON_CRASH) && !is_crashkernel_mem_reserved()) {
        printf("Memory for crashkernel is not reserved\n");
        printf("Please reserve memory by passing ");
        printf("\"crashkernel=X@Y\" parameter to the kernel\n");
        die("Then try loading kdump kernel\n");
    }

    if (do_load && (kexec_flags & KEXEC_PRESERVE_CONTEXT) &&
     mem_max == ULONG_MAX) {
        printf("Please specify memory range used by kexeced kernel\n");
        printf("to preserve the context of original kernel with \n");
        die("\"--mem-max\" parameter\n");
    }

    fileind = optind;
    /* Reset getopt for the next pass; called in other source modules */
    opterr = 1;
    optind = 1;

    result = arch_process_options(argc, argv);//先进来,一般情况下result返回为0


    /* Check for bogus options */
    if (!do_load) {
        while((opt = getopt_long(argc, argv, short_options,
                     options, 0)) != -1) {
            if ((opt == '?') || (opt >= OPT_ARCH_MAX)) {
                usage();
                return 1;
            }
        }
    }

    if (do_reuse_initrd){
        check_reuse_initrd();
        arch_reuse_initrd();
    }

    if (do_unload) {
        result = k_unload(kexec_flags);
    }
    if (do_load && (result == 0)) {//here1,here2:

        result = my_load(type, fileind, argc, argv, kexec_flags, entry);
//====>trap into
    }
    /* Don't shutdown unless there is something to reboot to! */
    if ((result == 0) && (do_shutdown || do_exec) && !kexec_loaded()) {
        die("Nothing has been loaded!\n");
    }
    if ((result == 0) && do_shutdown) {
        result = my_shutdown();
    }
    if ((result == 0) && do_sync) {
        sync();
    }
    if ((result == 0) && do_ifdown) {
        ifdown();
    }
    if ((result == 0) && do_exec) {//here3:

        result = my_exec();//=================>trap into

    }
    if ((result == 0) && do_load_jump_back_helper) {
        result = my_load_jump_back_helper(kexec_flags, entry);
    }

    fflush(stdout);
    fflush(stderr);
    return result;
}





/*
 *    Load the new kernel
 *
 *    type        optional kernel type name, matched against file_type[].name;
 *                when NULL the type is found by probing every entry
 *    fileind     index in argv of the kernel image path
 *    argc/argv   full command line, handed on to the selected loader
 *    kexec_flags flags stored in info.kexec_flags and passed to kexec_load()
 *    entry       when non-NULL, overrides the entry point chosen by the loader
 *
 *    Returns -1 for any failure detected before the system call, otherwise
 *    the return value of kexec_load().
 */

static int my_load(const char *type, int fileind, int argc, char **argv,
         unsigned long kexec_flags, void *entry)

{
    char *kernel;
    char *kernel_buf;
    off_t kernel_size;
    int i = 0;
    int result;
    struct kexec_info info;/* collects entry, segments and flags for kexec_load() */
    long native_arch;
    int guess_only = 0;

    memset(&info, 0, sizeof(info));
    info.segment = NULL;
    info.nr_segments = 0;
    info.entry = NULL;
    info.backup_start = 0;
    info.kexec_flags = kexec_flags;

    result = 0;
    if (argc - fileind <= 0) {
        fprintf(stderr, "No kernel specified\n");
        usage();
        return -1;
    }
    kernel = argv[fileind];
    /* slurp in the input kernel */
    kernel_buf = slurp_decompress_file(kernel, &kernel_size);/* presumably the decompressed image ends up in kernel_buf */

#if 0
    fprintf(stderr, "kernel: %p kernel_size: %lx\n",
        kernel_buf, kernel_size);
#endif

    if (get_memory_ranges(&info.memory_range, &info.memory_ranges,
        info.kexec_flags) < 0) {
        fprintf(stderr, "Could not get memory layout\n");
        return -1;
    }
    /* if a kernel type was specified, try to honor it */
    if (type) {
        for (i = 0; i < file_types; i++) {
            if (strcmp(type, file_type[i].name) == 0)
                break;
        }
        if (i == file_types) {
            fprintf(stderr, "Unsupported kernel type %s\n", type);
            return -1;
        } else {
            /* make sure our file is really of that type */


   /*

  Layout of the file_type table used here (x86 build):
struct file_type {
    const char *name;
    probe_t *probe;
    load_t  *load;
    usage_t *usage;
};// effectively a set of per-format callbacks

struct file_type file_type[] = {
    { "multiboot-x86", multiboot_x86_probe, multiboot_x86_load,
      multiboot_x86_usage },
    { "elf-x86", elf_x86_probe, elf_x86_load, elf_x86_usage },
    { "bzImage", bzImage_probe, bzImage_load, bzImage_usage },
    { "beoboot-x86", beoboot_probe, beoboot_load, beoboot_usage },
    { "nbi-x86", nbi_probe, nbi_load, nbi_usage },
};

   */

            /* A failed probe does not abort here: fall back to guessing
             * so the error below can name the type that does match. */
            if (file_type[i].probe(kernel_buf, kernel_size) < 0)
                guess_only = 1;
        }
    }
    if (!type || guess_only) {
        for (i = 0; i < file_types; i++) {
            if (file_type[i].probe(kernel_buf, kernel_size) >= 0)
                break;
        }
        if (i == file_types) {
            fprintf(stderr, "Cannot determine the file type "
                    "of %s\n", kernel);
            return -1;
        } else {
            if (guess_only) {
                fprintf(stderr, "Wrong file type %s, "
                    "file matches type %s\n",
                    type, file_type[i].name);
                return -1;
            }
        }
    }
    /* Dispatch to the loader of the matched type; this article follows
     * elf_x86_load(). */
    if (file_type[i].load(argc, argv, kernel_buf,
             kernel_size, &info) < 0) {

        fprintf(stderr, "Cannot load %s\n", kernel);
        return -1;
    }
    /* If we are not in native mode setup an appropriate trampoline */
    native_arch = physical_arch();
    if (native_arch < 0) {
        return -1;
    }
    info.kexec_flags |= native_arch;
    if (arch_compat_trampoline(&info) < 0) {
        return -1;
    }
    if (info.kexec_flags & KEXEC_PRESERVE_CONTEXT) {
        add_backup_segments(&info, mem_min, mem_max - mem_min + 1);
    }
    /* Verify all of the segments load to a valid location in memory */
    for (i = 0; i < info.nr_segments; i++) {
        if (!valid_memory_segment(&info, info.segment +i)) {
            fprintf(stderr, "Invalid memory segment %p - %p\n",
                info.segment[i].mem,
                ((char *)info.segment[i].mem) +
                info.segment[i].memsz);
            return -1;
        }
    }
    /* Sort the segments and verify we don't have overlaps */
    if (sort_segments(&info) < 0) {
        return -1;
    }
    /* if purgatory is loaded update it */
    /* NOTE(review): the article's author guesses (from kdump material, not
     * from reading the code) that this hashes the new kernel for an
     * integrity check and preserves the first 640K — unverified. */
    update_purgatory(&info);

    if (entry)
        info.entry = entry;
#if 0
    fprintf(stderr, "kexec_load: entry = %p flags = %lx\n",
        info.entry, info.kexec_flags);
    print_segments(stderr, &info);
#endif
    /* Hand the prepared segments to the kernel via the kexec_load wrapper. */
    result = kexec_load(
        info.entry, info.nr_segments, info.segment, info.kexec_flags);

    if (result != 0) {
        /* The load failed, print some debugging information */
        fprintf(stderr, "kexec_load failed: %s\n",
            strerror(errno));
        fprintf(stderr, "entry = %p flags = %lx\n",
            info.entry, info.kexec_flags);
        print_segments(stderr, &info);
    }
    return result;
}




int elf_x86_load(int argc, char **argv, const char *buf, off_t len,
    struct kexec_info *info)//******************

{
    struct mem_ehdr ehdr;
    const char *command_line;
    char *modified_cmdline;
    int command_line_len;
    int modified_cmdline_len;
    const char *ramdisk;
    unsigned long entry, max_addr;
    int arg_style;
#define ARG_STYLE_ELF 0
#define ARG_STYLE_LINUX 1
#define ARG_STYLE_NONE 2
    int opt;
#define OPT_APPEND        (OPT_ARCH_MAX+0)
#define OPT_REUSE_CMDLINE    (OPT_ARCH_MAX+1)
#define OPT_RAMDISK        (OPT_ARCH_MAX+2)
#define OPT_ARGS_ELF     (OPT_ARCH_MAX+3)
#define OPT_ARGS_LINUX     (OPT_ARCH_MAX+4)
#define OPT_ARGS_NONE     (OPT_ARCH_MAX+5)

    static const struct option options[] = {
        KEXEC_ARCH_OPTIONS
        { "command-line",    1, NULL, OPT_APPEND },
        { "append",        1, NULL, OPT_APPEND },
        { "reuse-cmdline",    1, NULL, OPT_REUSE_CMDLINE },
        { "initrd",        1, NULL, OPT_RAMDISK },
        { "ramdisk",        1, NULL, OPT_RAMDISK },
        { "args-elf",        0, NULL, OPT_ARGS_ELF },
        { "args-linux",        0, NULL, OPT_ARGS_LINUX },
        { "args-none",        0, NULL, OPT_ARGS_NONE },
        { 0,             0, NULL, 0 },
    };

    static const char short_options[] = KEXEC_OPT_STR "";

    /*
     * Parse the command line arguments
     */

    arg_style = ARG_STYLE_ELF;
    command_line = 0;
    modified_cmdline = 0;
    modified_cmdline_len = 0;
    ramdisk = 0;
    while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1) {
        switch(opt) {
        default:
            /* Ignore core options */
            if (opt < OPT_ARCH_MAX) {
                break;
            }
        case '?':
            usage();
            return -1;
        case OPT_APPEND:
            command_line = optarg;
            break;
        case OPT_REUSE_CMDLINE:
            command_line = get_command_line();
            break;
        case OPT_RAMDISK:
            ramdisk = optarg;
            break;
        case OPT_ARGS_ELF:
            arg_style = ARG_STYLE_ELF;
            break;
        case OPT_ARGS_LINUX:
            arg_style = ARG_STYLE_LINUX;
            break;
        case OPT_ARGS_NONE:
#ifdef __i386__
            arg_style = ARG_STYLE_NONE;
#else
            die("--args-none only works on arch i386\n");
#endif
            break;
        }
    }
    command_line_len = 0;
    if (command_line) {
        command_line_len = strlen(command_line) +1;
    }

    /* Need to append some command line parameters internally in case of
     * taking crash dumps.
     */

    if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {
        modified_cmdline = xmalloc(COMMAND_LINE_SIZE);
        memset((void *)modified_cmdline, 0, COMMAND_LINE_SIZE);
        if (command_line) {
            strncpy(modified_cmdline, command_line,
                        COMMAND_LINE_SIZE);
            modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
        }
        modified_cmdline_len = strlen(modified_cmdline);
    }

    /* Load the ELF executable 加载新内核,解析elf头并加载elf data,最终就是将内核各segmants(内核本质上就是一个ELF文件)弄到info中*/*/
    elf_exec_build_load(info, &ehdr, buf, len, 0);//========================>


    entry = ehdr.e_entry;
    max_addr = elf_max_addr(&ehdr);

    /* Do we want arguments? */
    if (arg_style != ARG_STYLE_NONE) {
        /* Load the setup code */
        elf_rel_build_load(info, &info->rhdr, (char *) purgatory, purgatory_size,
            0, ULONG_MAX, 1, 0);
    }
    if (arg_style == ARG_STYLE_NONE) {
        info->entry = (void *)entry;

    }
    else if (arg_style == ARG_STYLE_ELF) {
        unsigned long note_base;
        struct entry32_regs regs;
        uint32_t arg1, arg2;

        /* Setup the ELF boot notes */
        note_base = elf_boot_notes(info, max_addr,
            (unsigned char *) command_line, command_line_len);

        /* Initialize the stack arguments */
        arg2 = 0; /* No return address */
        arg1 = note_base;
        elf_rel_set_symbol(&info->rhdr, "stack_arg32_1", &arg1, sizeof(arg1));
        elf_rel_set_symbol(&info->rhdr, "stack_arg32_2", &arg2, sizeof(arg2));
        
        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
        regs.eip = entry; /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr, "stack_arg32_2");
        elf_rel_set_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));

        if (ramdisk) {
            die("Ramdisks not supported with generic elf arguments");
        }
    }
    else if (arg_style == ARG_STYLE_LINUX) {
        struct x86_linux_faked_param_header *hdr;
        unsigned long param_base;
        const unsigned char *ramdisk_buf;
        off_t ramdisk_length;
        struct entry32_regs regs;
        int rc = 0;

        /* Get the linux parameter header */
        hdr = xmalloc(sizeof(*hdr));

        /* Hack: With some ld versions, vmlinux program headers show
         * a gap of two pages between bss segment and data segment
         * but effectively kernel considers it as bss segment and
         * overwrites the any data placed there. Hence bloat the
         * memsz of parameter segment to 16K to avoid being placed
         * in such gaps.
         * This is a makeshift solution until it is fixed in kernel
         */

        param_base = add_buffer(info, hdr, sizeof(*hdr), 16*1024,
            16, 0, max_addr, 1);

        /* Initialize the parameter header */
        memset(hdr, 0, sizeof(*hdr));
        init_linux_parameters(&hdr->hdr);

        /* Add a ramdisk to the current image */
        ramdisk_buf = NULL;
        ramdisk_length = 0;
        if (ramdisk) {
            ramdisk_buf = (unsigned char *) slurp_file(ramdisk, &ramdisk_length);
        }

        /* If panic kernel is being loaded, additional segments need
         * to be created. */

        if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {
            rc = load_crashdump_segments(info, modified_cmdline,
                        max_addr, 0);
            if (rc < 0)
                return -1;
            /* Use new command line. */
            command_line = modified_cmdline;
            command_line_len = strlen(modified_cmdline) + 1;
        }

        /* Tell the kernel what is going on */
        setup_linux_bootloader_parameters(info, &hdr->hdr, param_base,
            offsetof(struct x86_linux_faked_param_header, command_line),
            command_line, command_line_len,
            ramdisk_buf, ramdisk_length);

        /* Fill in the information bios calls would usually provide */
        setup_linux_system_parameters(&hdr->hdr, info->kexec_flags);

        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
        regs.ebx = 0;        /* Bootstrap processor */
        regs.esi = param_base;    /* Pointer to the parameters */
        regs.eip = entry;    /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr, "stack_end"); /* Stack, unused */
        elf_rel_set_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
    }
    else {
        die("Unknown argument style\n");
    }
    return 0;
}





/*
 * User-space wrapper for the kexec_load system call.
 *
 * Forwards the entry point, the segment count, the segment array and the
 * flags unchanged to __NR_kexec_load (handled by sys_kexec_load in the
 * kernel) and returns whatever syscall() returns.
 */
static inline long kexec_load(void *entry, unsigned long nr_segments,
            struct kexec_segment *segments, unsigned long flags)
{
    long rc;

    rc = (long) syscall(__NR_kexec_load, entry, nr_segments, segments, flags);
    return rc;
}



下面看 kexec -e 执行的操作:


/*
 * Exec the new kernel (reboot)
 *
 * Asks the kernel to switch to the previously loaded image. Control only
 * comes back here when that request failed, so the function always reports
 * the errno text on stderr and returns -1.
 */

static int my_exec(void)
{
    /* The return value carries no extra information: reaching the next
     * statement at all already means the kexec did not happen. */
    (void) kexec_reboot();

    fprintf(stderr, "kexec failed: %s\n", strerror(errno));
    return -1;
}

/*
 * User-space wrapper that issues the reboot system call with the two magic
 * numbers and the LINUX_REBOOT_CMD_KEXEC command (the command value is what
 * sys_reboot later switches on); the fourth argument is unused and passed
 * as 0. Returns the syscall() result.
 */
static inline long kexec_reboot(void)
{
    long rc;

    rc = (long) syscall(__NR_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_KEXEC, 0);
    return rc;
}


+++++++++++++++++++++++trap  into kernel,我是分隔线+++++++++++++++++++++++++++++


/*
 * kexec_load system call: build a kimage from the user-supplied segments and
 * install it as the image to boot next (kexec_image) or, when KEXEC_ON_CRASH
 * is set, as the crash image (kexec_crash_image).
 *
 * entry        entry point handed to the image allocator
 * nr_segments  number of entries in segments; 0 builds no image, so the
 *              destination slot is simply replaced with NULL
 * segments     user-space array of segment descriptors
 * flags        KEXEC_* flags plus the architecture bits
 *
 * Returns 0 on success, -EPERM without CAP_SYS_BOOT, -EINVAL for bad flags,
 * a foreign architecture or too many segments, -EBUSY when another load is
 * in progress, or the error of the first failing helper.
 */
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                struct kexec_segment __user *segments,
                unsigned long flags)
{
    struct kimage **dest_image, *image;
    int locked;
    int result;

    /* We only trust the superuser with rebooting the system. */
    if (!capable(CAP_SYS_BOOT))
        return -EPERM;

    /*
     * Verify we have a legal set of flags
     * This leaves us room for future extensions.
     */

    if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
        return -EINVAL;

    /* Verify we are on the appropriate architecture */
    if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
        ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
        return -EINVAL;

    /* Put an artificial cap on the number
     * of segments passed to kexec_load.
     */

    if (nr_segments > KEXEC_SEGMENT_MAX)
        return -EINVAL;

    image = NULL;
    result = 0;

    /* Because we write directly to the reserved memory
     * region when loading crash kernels we need a mutex here to
     * prevent multiple crash kernels from attempting to load
     * simultaneously, and to prevent a crash kernel from loading
     * over the top of a in use crash kernel.
     *
     * KISS: always take the mutex.
     */

    locked = xchg(&kexec_lock, 1);
    if (locked)
        return -EBUSY;

    dest_image = &kexec_image;/* default slot: the normal kexec image (the case followed in this article) */
    if (flags & KEXEC_ON_CRASH)
        dest_image = &kexec_crash_image;/* crash (kdump) slot */
    if (nr_segments > 0) {
        unsigned long i;

        /* Loading another kernel to reboot into */
        if ((flags & KEXEC_ON_CRASH) == 0)/* a plain kexec load takes this branch */

            result = kimage_normal_alloc(&image, entry,
                            nr_segments, segments);/* step 1: allocate the kimage and its control page */

        /* Loading another kernel to switch to if this one crashes */
        else if (flags & KEXEC_ON_CRASH) {/* kdump path */

            /* Free any current crash dump kernel before
             * we corrupt it.
             */

            kimage_free(xchg(&kexec_crash_image, NULL));
            result = kimage_crash_alloc(&image, entry,
                         nr_segments, segments);
        }
        if (result)
            goto out;

        result = machine_kexec_prepare(image);/* NOTE(review): the article says this is empty on x86 — confirm */

        if (result)
            goto out;

        for (i = 0; i < nr_segments; i++) {
            /* NOTE(review): the article's author was unsure why segments
             * are loaded here after step 1. Presumably step 1 only records
             * the segment descriptors and this loop copies the segment
             * contents — confirm against kimage_load_segment(). */
            result = kimage_load_segment(image, &image->segment[i]);

            if (result)
                goto out;
        }
        result = kimage_terminate(image);
        if (result)
            goto out;
    }
    /* Install the new kernel, and Uninstall the old */
    /* Swap: the new image goes into *dest_image and the previously installed
     * one (possibly NULL) comes back in `image`, to be freed at `out`. */
    image = xchg(dest_image, image);

out:
    locked = xchg(&kexec_lock, 0); /* Release the mutex */
    BUG_ON(!locked);
    /* On success this frees the replaced image; on failure the partially
     * built new one. */
    kimage_free(image);

    return result;
}




/*
 * Allocate a kimage for a normal (non-crash) kexec load.
 *
 * rimage       out: receives the new image, written ONLY on success
 * entry        entry point passed on to do_kimage_alloc()
 * nr_segments  number of entries in segments
 * segments     user-space array of segment descriptors
 *
 * Returns 0 on success, the error from do_kimage_alloc(), or -ENOMEM when
 * the control code page cannot be allocated.
 *
 * *rimage is deliberately not touched on failure: the image is kfree()d
 * here, and the caller (sys_kexec_load) passes its own pointer to
 * kimage_free() on the error path, so publishing the pointer early would
 * leave the caller freeing already-freed memory.
 */
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
                unsigned long nr_segments,
                struct kexec_segment __user *segments)
{
    int result;
    struct kimage *image;

    /* Allocate and initialize a controlling structure */
    image = NULL;
    result = do_kimage_alloc(&image, entry, nr_segments, segments);
    if (result)
        goto out;

    /*
     * Find a location for the control code buffer, and add it
     * the vector of segments so that it's pages will also be
     * counted as destination pages.
     */

    result = -ENOMEM;
    image->control_code_page = kimage_alloc_control_pages(image,
                     get_order(KEXEC_CONTROL_CODE_SIZE));
    if (!image->control_code_page) {
        printk(KERN_ERR "Could not allocate control_code_buffer\n");
        goto out;
    }

    result = 0;
 out:
    /* Hand the image to the caller only once it is fully set up. */
    if (result == 0)
        *rimage = image;
    else
        kfree(image);

    return result;
}


简单小结下此阶段作的工作:
     就是将kexec -l命令行的新内核先拷贝到kernel_buf,然后再加载到结构体kexec_info info中,最后通过sys_kexec_load系统调用转移到动态内核内存页,这个系统调用给每个从用户空间传递而来的段分配动态内核页,并将段拷贝到这些内核页上



/*
 * Reboot system call: for obvious reasons only root may call it,
 * and even root needs to set up some magic numbers in the registers
 * so that some mistake won't make this reboot the whole machine.
 * You can also set the meaning of the ctrl-alt-del-key here.
 *
 * reboot doesn't sync: do that yourself before calling this.
 *
 * magic1/magic2  must match LINUX_REBOOT_MAGIC1 and one of the
 *                LINUX_REBOOT_MAGIC2* values, otherwise -EINVAL
 * cmd            one of the LINUX_REBOOT_CMD_* commands
 * arg            user pointer, only read for LINUX_REBOOT_CMD_RESTART2
 *
 * In the kexec -e walkthrough this is step 0 of the kernel side: user space
 * arrives here with cmd == LINUX_REBOOT_CMD_KEXEC.
 */

asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)

{
    char buffer[256];

    /* We only trust the superuser with rebooting the system. */
    if (!capable(CAP_SYS_BOOT))
        return -EPERM;

    /* For safety, we require "magic" arguments. */
    if (magic1 != LINUX_REBOOT_MAGIC1 ||
     (magic2 != LINUX_REBOOT_MAGIC2 &&
     magic2 != LINUX_REBOOT_MAGIC2A &&
            magic2 != LINUX_REBOOT_MAGIC2B &&
     magic2 != LINUX_REBOOT_MAGIC2C))
        return -EINVAL;

    /* Instead of trying to make the power_off code look like
     * halt when pm_power_off is not set do it the easy way.
     */

    if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
        cmd = LINUX_REBOOT_CMD_HALT;

    /* Every path out of the switch below must drop the lock again. */
    lock_kernel();
    switch (cmd) {
    case LINUX_REBOOT_CMD_RESTART:
        kernel_restart(NULL);
        break;

    case LINUX_REBOOT_CMD_CAD_ON:
        C_A_D = 1;
        break;

    case LINUX_REBOOT_CMD_CAD_OFF:
        C_A_D = 0;
        break;

    case LINUX_REBOOT_CMD_HALT:
        kernel_halt();
        unlock_kernel();
        do_exit(0);
        break;

    case LINUX_REBOOT_CMD_POWER_OFF:
        kernel_power_off();
        unlock_kernel();
        do_exit(0);
        break;

    case LINUX_REBOOT_CMD_RESTART2:
        /* Copy at most sizeof(buffer) - 1 bytes so the explicit
         * terminator below always fits. */
        if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
            unlock_kernel();
            return -EFAULT;
        }
        buffer[sizeof(buffer) - 1] = '\0';

        kernel_restart(buffer);
        break;

    case LINUX_REBOOT_CMD_KEXEC:/* the command sent by kexec_reboot() in user space */

        /* Coming back from kernel_kexec() means the switch to the new
         * kernel did not happen, hence the error return. */
        kernel_kexec();

        unlock_kernel();
        return -EINVAL;

#ifdef CONFIG_HIBERNATION
    case LINUX_REBOOT_CMD_SW_SUSPEND:
        {
            int ret = hibernate();
            unlock_kernel();
            return ret;
        }
#endif

    default:
        unlock_kernel();
        return -EINVAL;
    }
    unlock_kernel();
    return 0;
}



/**
 *    kernel_kexec - reboot the system
 *
 *    Takes ownership of the preloaded image by swapping kexec_image with
 *    NULL, prepares the restart, shuts the machine down and jumps into the
 *    image through machine_kexec(). When nothing was preloaded (or
 *    CONFIG_KEXEC is off) it simply returns and leaves error reporting to
 *    the caller. Step 1 of the kernel-side "kexec -e" walkthrough.
 */

static void kernel_kexec(void)
{
#ifdef CONFIG_KEXEC
    struct kimage *pending = xchg(&kexec_image, NULL);

    if (pending) {
        kernel_restart_prepare(NULL);
        printk(KERN_EMERG "Starting new kernel\n");
        machine_shutdown();
        machine_kexec(pending);
    }
#endif
}





/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 *
 * Step 2 of the kernel-side "kexec -e" walkthrough (i386): copies the
 * relocate_kernel stub into the image's control page, fills page_list with
 * the physical/virtual addresses of the control page and the kexec page
 * tables, invalidates the GDT and IDT, and finally calls relocate_kernel().
 */

NORET_TYPE void machine_kexec(struct kimage *image)

{
    unsigned long page_list[PAGES_NR];
    void *control_page;

    /* Interrupts aren't acceptable while we reboot */
    local_irq_disable();

    /* NOTE(review): the article's author asks whether kexec_crash_image is
     * the image set up by "kexec -p"; sys_kexec_load does select that slot
     * when KEXEC_ON_CRASH is set, while this call path starts from
     * kexec_image (see kernel_kexec). */
    control_page = page_address(image->control_code_page);

    /* One page of relocation code is copied into the control page. */
    memcpy(control_page, relocate_kernel, PAGE_SIZE);

    page_list[PA_CONTROL_PAGE] = __pa(control_page);/* physical address of the control page */

    page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
    page_list[PA_PGD] = __pa(kexec_pgd);
    page_list[VA_PGD] = (unsigned long)kexec_pgd;
#ifdef CONFIG_X86_PAE
    page_list[PA_PMD_0] = __pa(kexec_pmd0);
    page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
    page_list[PA_PMD_1] = __pa(kexec_pmd1);
    page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
#endif
    page_list[PA_PTE_0] = __pa(kexec_pte0);/* two page tables: kexec_pte0 and kexec_pte1 */

    page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
    page_list[PA_PTE_1] = __pa(kexec_pte1);
    page_list[VA_PTE_1] = (unsigned long)kexec_pte1;

    /* The segment registers are funny things, they have both a
     * visible and an invisible part. Whenever the visible part is
     * set to a specific selector, the invisible part is loaded
     * with from a table in memory. At no other time is the
     * descriptor table in memory accessed.
     *
     * I take advantage of this here by force loading the
     * segments, before I zap the gdt with an invalid value.
     */

    load_segments();/* presumably reloads the segment registers with the kernel code/data selectors (__KERNEL_CS / __KERNEL_DS) — confirm */

    /* The gdt & idt are now invalid.
     * If you want to load them you must set up your own idt & gdt.
     */

    set_gdt(phys_to_virt(0),0);/* invalidate the GDT ... */

    set_idt(phys_to_virt(0),0);/* ... and the IDT */

    /* now call it */
    /* relocate_kernel is, per the article, implemented in assembly under
     * arch/i386/kernel/ (relocate_kernel.S); the walkthrough continues
     * there. */
    relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
            image->start, cpu_has_pae);

}