Kdump之kexec源码分析

时间:2022-05-01 16:50:59
知道kexec还是在linuxsir上看到一篇介绍其应用的帖子, 经常用kexec快速启动内核的步骤如下:
(1).kexec -l <kernel-image> --append="<command-line-options>" [--initrd=xxxxxxxxxxx一般是要的,不过某些情况下可选]
例如: kexec -l /boot/vmlinuz-2.6.31  --append="root=/dev/sda6 ro nomce vga=0x317" --initrd=xxxxxxxxxxx
(2).kexec -e
再或者用kexec -p

下面仅分析 kexec -l 和 kexec -e 这两种形式


int main(int argc,
char *argv[])
{
    int do_load = 1;
    int do_exec = 0;
    int do_load_jump_back_helper
=
0;
    int do_shutdown
=
1;
    int do_sync = 1;
    int do_ifdown
=
0;
    int do_unload
=
0;
    int do_reuse_initrd
=
0;
    void *entry
= 0;
    char *type
= 0;
    char *endptr;
    int opt;
    int result = 0;
    int fileind;
    static const
struct option options[]
= {
        KEXEC_ARCH_OPTIONS
        { 0, 0, 0, 0},
    };
    static const
char short_options[]
= KEXEC_OPT_STR;

    opterr = 0;
/* Don't complain about unrecognized options here */
    while ((opt
= getopt_long(argc, argv, short_options,
                 options, 0))
!=
-
1) {
        switch(opt)
{
        case OPT_HELP:
            usage();
            return 0;
        case OPT_VERSION:
            version();
            return 0;
        case OPT_NOIFDOWN:
            do_ifdown = 0;
            break;
        case OPT_FORCE:
            do_load = 1;
            do_shutdown = 0;
            do_sync = 1;
            do_ifdown = 1;
            do_exec = 1;
            break;
        case OPT_LOAD://here1:

            do_load = 1;
            do_exec = 0;
            do_shutdown = 0;
            break;
        case OPT_UNLOAD:
            do_load = 0;
            do_shutdown = 0;
            do_sync = 0;
            do_unload = 1;
            break;
        case OPT_EXEC://here3:

            do_load = 0;
            do_shutdown = 0;
            do_sync = 1;
            do_ifdown = 1;
            do_exec = 1;
            break;
        case OPT_LOAD_JUMP_BACK_HELPER:
            do_load = 0;
            do_shutdown = 0;
            do_sync = 1;
            do_ifdown = 1;
            do_exec = 0;
            do_load_jump_back_helper = 1;
            kexec_flags = KEXEC_PRESERVE_CONTEXT;
            break;
        case OPT_ENTRY:
            entry =
(
void *)strtoul(optarg,
&endptr, 0);
            if (*endptr)
{
                fprintf(stderr,
                    "Bad option value in --load-jump-back-helper=%s\n",
                    optarg);
                usage();
                return 1;
            }
            break;
        case OPT_LOAD_PRESERVE_CONTEXT:
            do_load = 1;
            do_exec = 0;
            do_shutdown = 0;
            do_sync = 1;
            kexec_flags = KEXEC_PRESERVE_CONTEXT;
            break;
        case OPT_TYPE:
            type = optarg;
            break;
        case OPT_PANIC://here2:

            do_load = 1;
            do_exec = 0;
            do_shutdown = 0;
            do_sync = 0;
            kexec_flags = KEXEC_ON_CRASH;
            break;
        case OPT_MEM_MIN:
            mem_min =
strtoul
(optarg,
&endptr, 0);
            if (*endptr)
{
                fprintf(stderr,
                    "Bad option value in --mem-min=%s\n",
                    optarg);
                usage();
                return 1;
            }
            break;
        case OPT_MEM_MAX:
            mem_max =
strtoul
(optarg,
&endptr, 0);
            if (*endptr)
{
                fprintf(stderr,
                    "Bad option value in --mem-max=%s\n",
                    optarg);
                usage();
                return 1;
            }
            break;
        case OPT_REUSE_INITRD:
            do_reuse_initrd = 1;
            break;
        default:
            break;
        }
    }

    if ((kexec_flags
& KEXEC_ON_CRASH)
&&
!
is_crashkernel_mem_reserved())
{
        printf("Memory for crashkernel is not reserved\n");
        printf("Please reserve memory by passing ");
        printf("\"crashkernel=X@Y\" parameter to the kernel\n");
        die("Then try loading kdump kernel\n");
    }

    if (do_load
&&
(
kexec_flags & KEXEC_PRESERVE_CONTEXT)
&&
     mem_max ==
ULONG_MAX)
{

        printf("Please specify memory range used by kexeced kernel\n");
        printf("to preserve the context of original kernel with \n");
        die("\"--mem-max\" parameter\n");
    }

    fileind = optind;
    /* Reset getopt for the next pass; called in other source modules */
    opterr = 1;
    optind = 1;

    result = arch_process_options(argc, argv);//先进来,一般情况下result返回为0


    /* Check for bogus options */
    if (!do_load)
{
        while((opt
= getopt_long(argc, argv, short_options,
                     options, 0))
!=
-
1) {
            if ((opt
==
'?'
) ||
(opt >= OPT_ARCH_MAX))
{
                usage();
                return 1;
            }
        }
    }

    if (do_reuse_initrd){
        check_reuse_initrd();
        arch_reuse_initrd();
    }

    if (do_unload)
{
        result = k_unload(kexec_flags);
    }
    if (do_load
&&
(
result == 0))
{//here1,here2:

        result = my_load(type, fileind, argc, argv, kexec_flags, entry);
//====>trap into
    }
    /* Don't shutdown unless there is something to reboot to! */
    if ((result== 0)&& (do_shutdown || do_exec)&& !kexec_loaded()){
        die("Nothing has been loaded!\n");
    }
    if ((result== 0)&& do_shutdown){
        result = my_shutdown();
    }
    if ((result== 0)&& do_sync){
        sync();
    }
    if ((result== 0)&& do_ifdown){
        ifdown();
    }
    if ((result== 0)&& do_exec){//here3:

        result = my_exec();//=================>trap into

    }
    if ((result== 0)&& do_load_jump_back_helper){
        result = my_load_jump_back_helper(kexec_flags, entry);
    }

    fflush(stdout);
    fflush(stderr);
    return result;
}





/*
 *    Load the new kernel
 */

static int my_load(const
char *type,
int fileind,
int argc,
char
**argv,
         unsigned
long
kexec_flags,
void
*entry)//******************

{
    char *kernel;
    char *kernel_buf;
    off_t kernel_size;
    int i = 0;
    int result;
    struct kexec_info info;//1.注意这个结构体
    long native_arch;
    int guess_only
=
0;

    memset(&info, 0,
sizeof(info));
    info.segment =
NULL;
    info.nr_segments
=
0;
    info.entry =
NULL;
    info.backup_start
=
0;
    info.kexec_flags
=
kexec_flags;

    result = 0;
    if (argc
- fileind <= 0)
{
        fprintf(stderr,
"No kernel specified\n");
        usage();
        return -1;
    }
    kernel = argv[fileind];
    /* slurp in the input kernel */
    kernel_buf = slurp_decompress_file(kernel,
&kernel_size);//kernel_buf中含有解压缩的内核

#if 0
    fprintf(stderr,
"kernel: %p kernel_size: %lx\n",

        kernel_buf, kernel_size);
#endif

    if (get_memory_ranges(&info.memory_range,
&info.memory_ranges,
        info.kexec_flags)
< 0)
{

        fprintf(stderr,
"Could not get memory layout\n");
        return -1;
    }
    /* if a kernel type was specified, try to honor it */
    if (type)
{
        for (i
= 0; i
<
file_types; i++)
{
            if (strcmp(type, file_type[i].name)
== 0)
                break;
        }
        if (i
== file_types)
{
            fprintf(stderr,
"Unsupported kernel type %s\n", type);
            return
-
1;
        } else
{
            /* make sure our file is really of that type */


   /*

  此处用到的变量file_type的结构体
struct file_type {
    const char *name;
    probe_t *probe;
    load_t  *load;
    usage_t *usage;
};//类似指定回调函数

struct file_type file_type[] = {
    { "multiboot-x86", multiboot_x86_probe, multiboot_x86_load,
      multiboot_x86_usage },
    { "elf-x86", elf_x86_probe, elf_x86_load, elf_x86_usage },
    { "bzImage", bzImage_probe, bzImage_load, bzImage_usage },
    { "beoboot-x86", beoboot_probe, beoboot_load, beoboot_usage },
    { "nbi-x86", nbi_probe, nbi_load, nbi_usage },
};

   */

            if (file_type[i].probe(kernel_buf,
kernel_size) < 0)
                guess_only = 1;
        }
    }
    if (!type
|| guess_only)
{
        for (i
= 0; i
<
file_types; i++)
{
            if (file_type[i].probe(kernel_buf,
kernel_size) >= 0)
                break;
        }
        if (i
== file_types)
{
            fprintf(stderr,
"Cannot determine the file type "
                    "of %s\n", kernel);
            return
-
1;
        } else
{
            if (guess_only)
{
                fprintf(stderr,
"Wrong file type %s, "
                    "file matches type %s\n",
                    type, file_type[i].name);
                return
-
1;
            }
        }
    }
    if (file_type[i].load(argc,
argv, kernel_buf,
             kernel_size,
&
info) < 0)
{//===========>trap into elf_x86_load

        fprintf(stderr,
"Cannot load %s\n", kernel);
        return -1;
    }
    /* If we are not in native mode setup an appropriate trampoline */
    native_arch = physical_arch();
    if (native_arch
< 0)
{

        return -1;
    }
    info.kexec_flags
|
= native_arch;
    if (arch_compat_trampoline(&info)
< 0)
{

        return -1;
    }
    if (info.kexec_flags
& KEXEC_PRESERVE_CONTEXT)
{
        add_backup_segments(&info, mem_min, mem_max
- mem_min + 1);
    }
    /* Verify all of the segments load to a valid location in memory */
    for (i
=
0; i < info.nr_segments; i++)
{
        if (!valid_memory_segment(&info, info.segment
+i))
{
            fprintf(stderr,
"Invalid memory segment %p - %p\n",
                info.segment[i].mem,
                ((char
*)info.segment[i].mem)
+
                info.segment[i].memsz);
            return
-
1;
        }
    }
    /* Sort the segments and verify we don't have overlaps */
    if (sort_segments(&info)
< 0)
{

        return -1;
    }
    /* if purgatory is loaded update it */
    update_purgatory(&info);//这个对新内核做下hash来进行完整性检测,,并将前640k保留起来(没仔细看,根据kdump相关资料猜得,可能有误,以后确认)

    if (entry)
        info.entry
=
entry;
#if 0
    fprintf(stderr,
"kexec_load: entry = %p flags = %lx\n",

        info.entry, info.kexec_flags);
    print_segments(stderr,
&info);
#endif
    result = kexec_load(
        info.entry, info.nr_segments, info.segment,
info.kexec_flags);//====================>

    if (result
!= 0)
{
        /* The load failed, print some debugging information */
        fprintf(stderr,
"kexec_load failed: %s\n",

            strerror(errno));
        fprintf(stderr,
"entry = %p flags = %lx\n",

            info.entry, info.kexec_flags);
        print_segments(stderr,
&info);
    }
    return result;
}




int elf_x86_load(int argc,
char **argv,
const char
*
buf, off_t len,

    struct kexec_info
*
info)//******************

{
    struct mem_ehdr ehdr;
    const char
*command_line;
    char *modified_cmdline;
    int command_line_len;
    int modified_cmdline_len;
    const char
*ramdisk;
    unsigned long entry, max_addr;
    int arg_style;
#define ARG_STYLE_ELF 0
#define ARG_STYLE_LINUX 1
#define ARG_STYLE_NONE 2
    int opt;
#define OPT_APPEND        (OPT_ARCH_MAX+0)
#define OPT_REUSE_CMDLINE    (OPT_ARCH_MAX+1)
#define OPT_RAMDISK        (OPT_ARCH_MAX+2)
#define OPT_ARGS_ELF     (OPT_ARCH_MAX+3)
#define OPT_ARGS_LINUX     (OPT_ARCH_MAX+4)
#define OPT_ARGS_NONE     (OPT_ARCH_MAX+5)

    static const
struct option options[]
= {
        KEXEC_ARCH_OPTIONS
        { "command-line",    1,
NULL, OPT_APPEND
},
        { "append",        1,
NULL, OPT_APPEND
},
        { "reuse-cmdline",    1,
NULL, OPT_REUSE_CMDLINE
},
        { "initrd",        1,
NULL, OPT_RAMDISK
},
        { "ramdisk",        1,
NULL, OPT_RAMDISK
},
        { "args-elf",        0,
NULL, OPT_ARGS_ELF
},
        { "args-linux",        0,
NULL, OPT_ARGS_LINUX
},
        { "args-none",        0,
NULL, OPT_ARGS_NONE
},
        { 0,             0,
NULL, 0
}
,
    };

    static const
char short_options[]
= KEXEC_OPT_STR "";

    /*
     * Parse the command line arguments
     */

    arg_style = ARG_STYLE_ELF;
    command_line = 0;
    modified_cmdline = 0;
    modified_cmdline_len = 0;
    ramdisk = 0;
    while((opt
= getopt_long(argc, argv, short_options, options,
0))
!
= -1)
{
        switch(opt)
{
        default:
            /* Ignore core options */
            if (opt
< OPT_ARCH_MAX)
{
                break;
            }
        case '?':
            usage();
            return
-
1;
        case OPT_APPEND:
            command_line = optarg;
            break;
        case OPT_REUSE_CMDLINE:
            command_line = get_command_line();
            break;
        case OPT_RAMDISK:
            ramdisk = optarg;
            break;
        case OPT_ARGS_ELF:

            arg_style = ARG_STYLE_ELF;
            break;
        case OPT_ARGS_LINUX:
            arg_style = ARG_STYLE_LINUX;
            break;
        case OPT_ARGS_NONE:
#ifdef __i386__
            arg_style = ARG_STYLE_NONE;
#else
            die("--args-none only works on arch i386\n");
#endif
            break;
        }
    }
    command_line_len = 0;
    if (command_line)
{
        command_line_len =
strlen
(command_line)
+1;
    }

    /* Need to append some command line parameters internally in case of
     * taking crash dumps.
     */

    if (info->kexec_flags
& (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT))
{
        modified_cmdline = xmalloc(COMMAND_LINE_SIZE);
        memset((void
*)modified_cmdline, 0, COMMAND_LINE_SIZE);
        if (command_line)
{
            strncpy(modified_cmdline, command_line,
                        COMMAND_LINE_SIZE);
            modified_cmdline[COMMAND_LINE_SIZE
- 1]
=
'\0';
        }
        modified_cmdline_len =
strlen
(modified_cmdline);
    }

    /* Load the ELF executable 加载新内核,解析elf头并加载elf data,最终就是将内核各segmants(内核本质上就是一个ELF文件)弄到info中*/*/
    elf_exec_build_load(info,
&ehdr, buf, len, 0);//========================>


    entry = ehdr.e_entry;
    max_addr = elf_max_addr(&ehdr);

    /* Do we want arguments? */
    if (arg_style
!= ARG_STYLE_NONE)
{
        /* Load the setup code */
        elf_rel_build_load(info,
&info->rhdr,
(char
*
) purgatory, purgatory_size,
            0, ULONG_MAX, 1, 0);
    }
    if (arg_style
== ARG_STYLE_NONE)
{
        info->entry
= (void
*)entry;

    }
    else if
(arg_style == ARG_STYLE_ELF)
{
        unsigned long note_base;
        struct entry32_regs regs;
        uint32_t arg1, arg2;

        /* Setup the ELF boot notes */
        note_base = elf_boot_notes(info, max_addr,
            (unsigned
char *) command_line, command_line_len);

        /* Initialize the stack arguments */
        arg2 = 0;
/* No return address */
        arg1 = note_base;
        elf_rel_set_symbol(&info->rhdr,
"stack_arg32_1",
&arg1,
sizeof
(arg1));
        elf_rel_set_symbol(&info->rhdr,
"stack_arg32_2",
&arg2,
sizeof
(arg2));
        
        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr,
"entry32_regs",
&regs,
sizeof
(regs));
        regs.eip = entry;
/* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr,
"stack_arg32_2");
        elf_rel_set_symbol(&info->rhdr,
"entry32_regs",
&regs,
sizeof
(regs));

        if (ramdisk)
{
            die("Ramdisks not supported with generic elf arguments");
        }
    }
    else if
(arg_style == ARG_STYLE_LINUX)
{
        struct x86_linux_faked_param_header
*
hdr;
        unsigned long param_base;
        const unsigned
char *ramdisk_buf;
        off_t ramdisk_length;
        struct entry32_regs regs;
        int rc = 0;

        /* Get the linux parameter header */
        hdr = xmalloc(sizeof(*hdr));

        /* Hack: With some ld versions, vmlinux program headers show
         * a gap of two pages between bss segment and data segment
         * but effectively kernel considers it as bss segment and
         * overwrites the any data placed there. Hence bloat the
         * memsz of parameter segment to 16K to avoid being placed
         * in such gaps.
         * This is a makeshift solution until it is fixed in kernel
         */

        param_base = add_buffer(info, hdr,
sizeof(*hdr), 16*1024,
            16, 0, max_addr, 1);

        /* Initialize the parameter header */
        memset(hdr, 0,
sizeof(*hdr));
        init_linux_parameters(&hdr->hdr);

        /* Add a ramdisk to the current image */
        ramdisk_buf =
NULL
;
        ramdisk_length = 0;
        if (ramdisk)
{
            ramdisk_buf =
(
unsigned char
*) slurp_file(ramdisk,
&ramdisk_length);
        }

        /* If panic kernel is being loaded, additional segments need
         * to be created. */

        if (info->kexec_flags
& (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT))
{
            rc = load_crashdump_segments(info, modified_cmdline,
                        max_addr, 0);
            if (rc
< 0)
                return
-
1;
            /* Use new command line. */
            command_line = modified_cmdline;
            command_line_len =
strlen
(modified_cmdline)
+ 1;
        }

        /* Tell the kernel what is going on */
        setup_linux_bootloader_parameters(info,
&hdr->hdr, param_base,

            offsetof(struct x86_linux_faked_param_header, command_line),
            command_line, command_line_len,
            ramdisk_buf, ramdisk_length);

        /* Fill in the information bios calls would usually provide */
        setup_linux_system_parameters(&hdr->hdr, info->kexec_flags);

        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr,
"entry32_regs",
&regs,
sizeof
(regs));
        regs.ebx = 0;        /* Bootstrap processor */
        regs.esi = param_base;    /* Pointer to the parameters */
        regs.eip = entry;    /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr,
"stack_end");
/* Stack, unused */
        elf_rel_set_symbol(&info->rhdr,
"entry32_regs",
&regs,
sizeof
(regs));
    }
    else {
        die("Unknown argument style\n");
    }
    return 0;
}





/*
 * Thin user-space wrapper around the kexec_load system call.
 *
 * All four arguments are forwarded unchanged; the syscall's return value
 * is handed back as a long.  The article follows this call into the
 * kernel's sys_kexec_load().
 */
static inline long kexec_load(void *entry, unsigned long nr_segments,
            struct kexec_segment *segments, unsigned long flags)
{
    long rc;

    /* Traps into the kernel. */
    rc = (long) syscall(__NR_kexec_load, entry, nr_segments, segments, flags);
    return rc;
}



下面看 kexec -e 执行的操作:


/*
 * Exec the new kernel (reboot).
 *
 * Calls kexec_reboot().  Control only comes back here if that call did
 * not switch kernels, so reaching the code after it always means failure:
 * the errno text is printed and -1 is returned.
 */
static int my_exec(void)
{
    /* The return value carries no extra information: any return is a failure. */
    (void) kexec_reboot();
    /* I have failed if I make it here */
    fprintf(stderr, "kexec failed: %s\n",
        strerror(errno));
    return -1;
}

/*
 * Thin user-space wrapper around the reboot system call.
 *
 * Passes the two reboot magic numbers and LINUX_REBOOT_CMD_KEXEC; the
 * article highlights that command value because sys_reboot() dispatches
 * on it.  Returns the syscall's result as a long.
 */
static inline long kexec_reboot(void)
{
    long rc;

    /* Ends up in sys_reboot() in the kernel. */
    rc = (long) syscall(__NR_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                LINUX_REBOOT_CMD_KEXEC, 0);
    return rc;
}


+++++++++++++++++++++++trap  into kernel,我是分隔线+++++++++++++++++++++++++++++


/*
 * kexec_load system call: build a kimage from user-supplied segments and
 * install it as the image to boot (kexec_image) or, with KEXEC_ON_CRASH,
 * as the crash image (kexec_crash_image).
 *
 * Returns 0 on success, -EPERM without CAP_SYS_BOOT, -EINVAL for bad
 * flags/architecture/segment count, -EBUSY when another load holds
 * kexec_lock, or the error of the failing allocation/load step.
 */
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                struct kexec_segment __user *segments,
                unsigned long flags)
{
    struct kimage **dest_image, *image = NULL;
    unsigned long arch = flags & KEXEC_ARCH_MASK;
    int result = 0;
    int locked;

    /* We only trust the superuser with rebooting the system. */
    if (!capable(CAP_SYS_BOOT))
        return -EPERM;

    /*
     * Verify we have a legal set of flags
     * This leaves us room for future extensions.
     */
    if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
        return -EINVAL;

    /* Verify we are on the appropriate architecture */
    if (arch != KEXEC_ARCH && arch != KEXEC_ARCH_DEFAULT)
        return -EINVAL;

    /* Put an artificial cap on the number
     * of segments passed to kexec_load.
     */
    if (nr_segments > KEXEC_SEGMENT_MAX)
        return -EINVAL;

    /* Because we write directly to the reserved memory
     * region when loading crash kernels we need a mutex here to
     * prevent multiple crash kernels from attempting to load
     * simultaneously, and to prevent a crash kernel from loading
     * over the top of a in use crash kernel.
     *
     * KISS: always take the mutex.
     */
    locked = xchg(&kexec_lock, 1);
    if (locked)
        return -EBUSY;

    /*
     * Normal kexec installs into kexec_image (the case the article
     * follows); a kdump load installs into kexec_crash_image.
     */
    dest_image = (flags & KEXEC_ON_CRASH) ? &kexec_crash_image : &kexec_image;

    if (nr_segments > 0) {
        unsigned long i;

        if (!(flags & KEXEC_ON_CRASH)) {
            /*
             * Loading another kernel to reboot into (the path taken
             * for plain kexec).  Author's note: step 1 - allocates the
             * kimage and a control page and copies the segments into
             * kernel space.
             */
            result = kimage_normal_alloc(&image, entry,
                            nr_segments, segments);
        } else {
            /* Loading another kernel to switch to if this one crashes
             * (kdump related).
             */

            /* Free any current crash dump kernel before
             * we corrupt it.
             */
            kimage_free(xchg(&kexec_crash_image, NULL));
            result = kimage_crash_alloc(&image, entry,
                         nr_segments, segments);
        }
        if (result)
            goto out;

        /* Author's note: empty on x86. */
        result = machine_kexec_prepare(image);
        if (result)
            goto out;

        for (i = 0; i < nr_segments; i++) {
            /*
             * Author's open question: this loads each segment, but
             * was that not already done in step 1?
             */
            result = kimage_load_segment(image, &image->segment[i]);
            if (result)
                goto out;
        }
        result = kimage_terminate(image);
        if (result)
            goto out;
    }
    /*
     * Install the new kernel, and Uninstall the old: image receives the
     * pointer previously held in *dest_image, which is freed below.
     */
    image = xchg(dest_image, image);

out:
    locked = xchg(&kexec_lock, 0); /* Release the mutex */
    BUG_ON(!locked);
    kimage_free(image);

    return result;
}




/*
 * Allocate a kimage for a normal (non-crash) kexec load.
 *
 * rimage      - out: receives the new image, written ONLY on success
 * entry       - entry point handed to do_kimage_alloc()
 * nr_segments - number of entries in segments
 * segments    - user-space segment array
 *
 * Returns 0 on success, the do_kimage_alloc() error, or -ENOMEM when the
 * control code page cannot be allocated.
 */
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
                unsigned long nr_segments,
                struct kexec_segment __user *segments)
{
    int result;
    struct kimage *image;

    /* Allocate and initialize a controlling structure */
    /* Author's note: allocates room for one struct kimage. */
    image = NULL;
    result = do_kimage_alloc(&image, entry, nr_segments, segments);
    if (result)
        goto out;

    /*
     * Do not publish image through *rimage yet: if the allocation below
     * fails the image is kfree()d here, and sys_kexec_load() passes its
     * own pointer to kimage_free() on its error path.  Publishing early
     * would leave the caller holding, and freeing, a dangling pointer.
     */

    /*
     * Find a location for the control code buffer, and add it
     * the vector of segments so that it's pages will also be
     * counted as destination pages.
     */
    result = -ENOMEM;
    image->control_code_page = kimage_alloc_control_pages(image,
                     get_order(KEXEC_CONTROL_CODE_SIZE));
    if (!image->control_code_page) {
        printk(KERN_ERR "Could not allocate control_code_buffer\n");
        goto out;
    }

    result = 0;
 out:
    if (result == 0)
        *rimage = image;
    else
        kfree(image);

    return result;
}


简单小结一下此阶段所做的工作:
     先将kexec -l命令行指定的新内核读入kernel_buf,再把它的各个段加载到结构体kexec_info info中,最后通过sys_kexec_load系统调用把这些段转移到动态分配的内核内存页上:该系统调用为每个从用户空间传入的段分配内核页,并将段的内容拷贝到这些内核页上。



/*
 * Reboot system call: for obvious reasons only root may call it,
 * and even root needs to set up some magic numbers in the registers
 * so that some mistake won't make this reboot the whole machine.
 * You can also set the meaning of the ctrl-alt-del-key here.
 *
 * reboot doesn't sync: do that yourself before calling this.
 *
 * Step 0 of the article's kernel-side walkthrough: the
 * LINUX_REBOOT_CMD_KEXEC case below is where "kexec -e" arrives.
 */
asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
                void __user *arg)
{
    char buffer[256];

    /* We only trust the superuser with rebooting the system. */
    if (!capable(CAP_SYS_BOOT))
        return -EPERM;

    /* For safety, we require "magic" arguments. */
    if (magic1 != LINUX_REBOOT_MAGIC1)
        return -EINVAL;
    if (magic2 != LINUX_REBOOT_MAGIC2 &&
        magic2 != LINUX_REBOOT_MAGIC2A &&
        magic2 != LINUX_REBOOT_MAGIC2B &&
        magic2 != LINUX_REBOOT_MAGIC2C)
        return -EINVAL;

    /* Instead of trying to make the power_off code look like
     * halt when pm_power_off is not set do it the easy way.
     */
    if (cmd == LINUX_REBOOT_CMD_POWER_OFF && !pm_power_off)
        cmd = LINUX_REBOOT_CMD_HALT;

    lock_kernel();
    switch (cmd) {
    case LINUX_REBOOT_CMD_RESTART:
        kernel_restart(NULL);
        break;

    case LINUX_REBOOT_CMD_CAD_ON:
        C_A_D = 1;
        break;

    case LINUX_REBOOT_CMD_CAD_OFF:
        C_A_D = 0;
        break;

    case LINUX_REBOOT_CMD_HALT:
        kernel_halt();
        unlock_kernel();
        do_exit(0);
        break;

    case LINUX_REBOOT_CMD_POWER_OFF:
        kernel_power_off();
        unlock_kernel();
        do_exit(0);
        break;

    case LINUX_REBOOT_CMD_RESTART2:
        /* Copy at most sizeof(buffer) - 1 bytes of the restart command. */
        if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
            unlock_kernel();
            return -EFAULT;
        }
        /* Terminate explicitly in case the user string filled the buffer. */
        buffer[sizeof(buffer) - 1] = '\0';

        kernel_restart(buffer);
        break;

    case LINUX_REBOOT_CMD_KEXEC:
        /*
         * This is the command value user-space kexec_reboot() passes.
         * Coming back from kernel_kexec() means the switch did not
         * happen, hence the error return.
         */
        kernel_kexec();
        unlock_kernel();
        return -EINVAL;

#ifdef CONFIG_HIBERNATION
    case LINUX_REBOOT_CMD_SW_SUSPEND:
        {
            int ret = hibernate();
            unlock_kernel();
            return ret;
        }
#endif

    default:
        unlock_kernel();
        return -EINVAL;
    }
    unlock_kernel();
    return 0;
}



/**
 *    kernel_kexec - reboot the system
 *
 *    Move into place and start executing a preloaded standalone
 *    executable. If nothing was preloaded, simply return to the caller.
 *
 *    Step 1 of the article's kernel-side walkthrough.
 */
static void kernel_kexec(void)
{
#ifdef CONFIG_KEXEC
    /* Take ownership of the loaded image; kexec_image is left NULL. */
    struct kimage *image = xchg(&kexec_image, NULL);

    if (!image)
        return;

    kernel_restart_prepare(NULL);
    printk(KERN_EMERG "Starting new kernel\n");
    machine_shutdown();
    /* The article follows this call into machine_kexec(). */
    machine_kexec(image);
#endif
}





/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */

NORET_TYPE void machine_kexec(struct kimage
*image)// 2. kdump相关==========

{
    unsigned long page_list[PAGES_NR];
    void *control_page;

    /* Interrupts aren't acceptable while we reboot */
    local_irq_disable();

    control_page = page_address(image->control_code_page);//kexec_crash_image是kexec
-p时定义的?????

    memcpy(control_page, relocate_kernel, PAGE_SIZE);

    page_list[PA_CONTROL_PAGE]
= __pa(control_page);//控制页

    page_list[VA_CONTROL_PAGE]
= (unsigned
long)relocate_kernel;
    page_list[PA_PGD]
= __pa(kexec_pgd);
    page_list[VA_PGD]
= (unsigned
long)kexec_pgd;
#ifdef CONFIG_X86_PAE
    page_list[PA_PMD_0]
= __pa(kexec_pmd0);
    page_list[VA_PMD_0]
= (unsigned
long)kexec_pmd0;
    page_list[PA_PMD_1]
= __pa(kexec_pmd1);
    page_list[VA_PMD_1]
= (unsigned
long)kexec_pmd1;
#endif
    page_list[PA_PTE_0]
= __pa(kexec_pte0);//两个页表kexec_pte0和kexec_pte1

    page_list[VA_PTE_0]
= (unsigned
long)kexec_pte0;
    page_list[PA_PTE_1]
= __pa(kexec_pte1);
    page_list[VA_PTE_1]
= (unsigned
long)kexec_pte1;

    /* The segment registers are funny things, they have both a
     * visible and an invisible part. Whenever the visible part is
     * set to a specific selector, the invisible part is loaded
     * with from a table in memory. At no other time is the
     * descriptor table in memory accessed.
     *
     * I take advantage of this here by force loading the
     * segments, before I zap the gdt with an invalid value.
     */

    load_segments();//将内核数据段( __KERNEL_DS )代码段( __KERNEL_CS )值装载到段寄存器

    /* The gdt & idt are now invalid.
     * If you want to load them you must set up your own idt & gdt.
     */

    set_gdt(phys_to_virt(0),0);//并使用
GDT 和 IDT 无效

    set_idt(phys_to_virt(0),0);

    /* now call it */
    relocate_kernel((unsigned
long)image->head,
(unsigned
long
)page_list,
            image->start, cpu_has_pae);//位于/arch/i386/kernel/relocate_kernel.s中============>trap
into===========

}