Linux 内核启动过程分析----zImage自解压

要想移植内核，肯定是要知道内核的启动过程的，包括协处理器的操作。这对我们理解ARM工作方式、MMU配置，中断是很有帮助的。最近写代码太少，有时间研究内核了。下面是我个人主观对内核启动过程的分析，如有不同，请提出探讨，共同进步！
上一篇分析过了，uboot引导的uImage，最先执行的代码是由arch/arm/boot/compressed下的vmlinux.lds文件决定的。链接脚本把_start放在.text段最开头的.start输入段上，对应的就是start标号，它定义在arch/arm/boot/compressed/head.S中，那么就先分析arch/arm/boot/compressed/head.S吧。
这个head.S开头定义了一些跟调试相关的宏，先不考虑。

/*
 * start - entry label of the zImage decompressor.
 *
 * Per the boot-loader call described in this article, r1 holds the
 * architecture ID and r2 the atags pointer on entry; both are copied
 * to r7/r8 at label 1 below.
 */
start:
.type   start,#function
/* Seven no-ops here; the first ARM() line below adds an eighth in ARM builds. */
.rept   7
mov r0, r0
.endr
   ARM(     mov r0, r0      )
   ARM(     b   1f      )
 THUMB(     adr r12, BSYM(1f)   )
 THUMB(     bx  r12     )

/* Three data words; the branch above jumps over them to label 1. */
.word   0x016f2818      @ Magic numbers to help the loader
.word   start           @ absolute load/run zImage address
.word   _edata          @ zImage end address
 THUMB(     .thumb          )
1:      mov r7, r1          @ save architecture ID
mov r8, r2          @ save atags pointer
//
/*arm/Makefile:117:KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_THUMB2) $(arch-y) $(tune-y) -include asm/unified.h -msoft-float
在unified.h中定义了
由于我们没有定义CONFIG_THUMB2_KERNEL，因此
#define ARM(x...) x
#define THUMB(x...)
#define W(instr) instr
#define BSYM(sym) sym
*/

在start的开始处，使用.rept 7重复了7条mov r0, r0，再加上随后ARM()中的一条，一共空出了8个32位字。现在还不明白它的目的，可能是为了留出异常向量表的位置。这里先不下决定。
//mov r7, r1 @ save architecture ID
// mov r8, r2 @ save atags pointer
这两句是关键，就是这两行保存了uboot传进来的参数。
uboot的最后执行的是thekernel(0,archid,bd->bi_boot_params);
而这个thekernel正好是跳转到了start这个位置。其中的参数不足4个，分别用r0/r1/r2寄存器传递,其中r0=0;r1=archid,r2=atags参数的首地址。
这是第一步：将uboot传进来的arch id 和 atags指针保存在r7、r8中。

#ifndef __ARM_ARCH_2__
/*
 * Booting from Angel - need to enter SVC mode and disable
 * FIQs/IRQs (numeric definitions from angel arm.h source).
 * We only do this if we were in user mode on entry.
 */
        mrs r2, cpsr        @ get current mode
tst r2, #3 @ not user? (author's note: U-Boot has already put the CPU in SVC mode)
        bne not_angel  // author's note: taken on this platform, so the Angel SWI below is skipped
mov r0, #0x17 @ angel_SWIreason_EnterSVC
 ARM(       swi 0x123456    )   @ angel_SWI_ARM
 THUMB(     svc 0xab        )   @ angel_SWI_THUMB
not_angel:
        mrs r2, cpsr        @ turn off interrupts to
        orr r2, r2, #0xc0 @ prevent angel from running
        msr cpsr_c, r2 
        // Sets CPSR bits 7 and 6 (the I and F bits): I=1 masks IRQ and F=1 masks FIQ, so both interrupt types are disabled.
#else
        teqp    pc, #0x0c000003 @ turn off interrupts
#endif
    ldr r4, =zreladdr // zreladdr comes from arch/arm/mach-xxx/Makefile.boot; the author's values are:        /*
 1 zreladdr-y := 0x80008000
 2 params_phys-y := 0x00000100
 3 initrd_phys-y := 0x00800000
 */
    bl  cache_on   // turn on the cache.

cache_on操作这一部分的代码如下：

/*
 * cache_on - select the 'cache on' method (offset 8 within a
 * proc_types entry) and dispatch through call_cache_fn.
 *
 * call_cache_fn walks proc_types until ((real_id ^ match) & mask) == 0,
 * then jumps to entry address + r3.
 */
cache_on:   mov r3, #8 @ cache_on function
        b   call_cache_fn
call_cache_fn:  adr r12, proc_types
#ifdef CONFIG_CPU_CP15
        mrc p15, 0, r9, c0, c0  @ get processor ID
#else
        ldr r9, =CONFIG_PROCESSOR_ID
#endif
1:      ldr r1, [r12, #0] @ get value
        ldr r2, [r12, #4] @ get mask
eor r1, r1, r9      @ (real ^ match)// r1 becomes 0 when the table value equals the processor ID in r9
tst r1, r2          @       & mask // Z is set when (r1 & r2) == 0, i.e. this entry matches
 ARM(       addeq   pc, r12, r3     ) @ call cache function
 THUMB(     addeq   r12, r3         )
 THUMB(     moveq   pc, r12         ) @ call cache function
add r12, r12, #PROC_ENTRY_SIZE 
/* compressed/head.S:613:#define PROC_ENTRY_SIZE (4*5)
Each proc_types entry is five 32-bit words (match value, mask, and three
method instructions), so adding PROC_ENTRY_SIZE steps to the next processor.
*/
        b   1b

/*
 * Table for cache operations. This is basically:
 * - CPU ID match
 * - CPU ID mask
 * - 'cache on' method instruction
 * - 'cache off' method instruction
 * - 'cache flush' method instruction
 *
 * We match an entry using: ((real_id ^ match) & mask) == 0
 *
 * Writethrough caches generally only need 'on' and 'off'
 * methods. Writeback caches _must_ have the flush method
 * defined.
 *
 * call_cache_fn jumps to (entry address + r3); cache_on sets r3 to 8,
 * which is the 'cache on' slot that follows the two data words.
 * The final entry has match 0 / mask 0, so it matches any ID.
 */
.align  2
.type   proc_types,#object
proc_types:
.word   0x41560600      @ ARM6/610
.word   0xffffffe0
        W(b)    __arm6_mmu_cache_off    @ works, but slow
        W(b)    __arm6_mmu_cache_off
mov pc, lr
#if !defined(CONFIG_CPU_V7)
/* This collides with some V7 IDs, preventing correct detection */
.word   0x00000000      @ old ARM ID
.word   0x0000f000
mov pc, lr
mov pc, lr
mov pc, lr
#endif

.word   0x41007000      @ ARM7/710
.word   0xfff8fe00
        W(b)    __arm7_mmu_cache_off
        W(b)    __arm7_mmu_cache_off
mov pc, lr

.word   0x41807200      @ ARM720T (writethrough)
.word   0xffffff00
        W(b)    __armv4_mmu_cache_on
        W(b)    __armv4_mmu_cache_off
mov pc, lr

    ..../* entries for the other supported processors are omitted here */.......

.word   0x000f0000      @ new CPU Id
.word   0x000f0000
        W(b)    __armv7_mmu_cache_on
        W(b)    __armv7_mmu_cache_off
        W(b)    __armv7_mmu_cache_flush
.word   0           @ unrecognised type
.word   0
mov pc, lr
 THUMB(     nop             )
mov pc, lr
 THUMB(     nop             )
mov pc, lr
 THUMB(     nop             )
.size   proc_types, . - proc_types

上述条件如果匹配成功了，那么就会执行匹配到的表项起始地址(r12)+8个字节(r3指定)处的指令，也就是该表项的'cache on'方法。对于armv7，执行的就是跳转到__armv7_mmu_cache_on的那条指令。

        .word   0x000f0000      @ new CPU Id
.word   0x000f0000
        W(b)    __armv7_mmu_cache_on
        W(b)    __armv7_mmu_cache_off
        W(b)    __armv7_mmu_cache_flush

/*
 * __armv7_mmu_cache_on - 'cache on' method for the "new CPU Id" entry
 * of proc_types.
 *
 * With CONFIG_MMU, reads ID_MMFR0 and, when its low four bits (VMSA)
 * are non-zero, calls __setup_mmu, flushes the TLBs and sets the MMU
 * enable bit in the control register. The caller's lr is kept in r12
 * because the blne below overwrites lr.
 */
__armv7_mmu_cache_on:
mov r12, lr         // save the return address in r12.
#ifdef CONFIG_MMU
        mrc p15, 0, r11, c0, c1, 4  @ read ID_MMFR0
tst r11, #0xf @ VMSA
        blne    __setup_mmu
mov r0, #0
        mcr p15, 0, r0, c7, c10, 4  @ drain write buffer
tst r11, #0xf @ VMSA
        mcrne   p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
#endif
        mrc p15, 0, r0, c1, c0, 0   @ read control reg
        bic r0, r0, #1 << 28 @ clear SCTLR.TRE
        orr r0, r0, #0x5000 @ I-cache enable, RR cache replacement
        orr r0, r0, #0x003c @ write buffer
#ifdef CONFIG_MMU
#ifdef CONFIG_CPU_ENDIAN_BE8
        orr r0, r0, #1 << 25 @ big-endian page tables
#endif
        /*
         * The 'ne' instructions below still use the flags from the last
         * "tst r11, #0xf": none of the bic/orr instructions in between
         * carries an S suffix.
         * NOTE(review): r3 is presumably the page table address left by
         * __setup_mmu, which is not shown here - confirm.
         */
        orrne   r0, r0, #1 @ MMU enabled
        movne   r1, #-1
        mcrne   p15, 0, r3, c2, c0, 0   @ load page table pointer
        mcrne   p15, 0, r1, c3, c0, 0   @ load domain access control
#endif
        mcr p15, 0, r0, c7, c5, 4   @ ISB
        mcr p15, 0, r0, c1, c0, 0   @ load control register
        mrc p15, 0, r0, c1, c0, 0   @ and read it back
mov r0, #0
        mcr p15, 0, r0, c7, c5, 4   @ ISB
mov pc, r12    // return via the saved lr, i.e. to the instruction following "bl cache_on".

开启cache的过程中会设置MMU，完成cache_on的工作后，返回到了bl    cache_on的下一条指令。

bl  cache_on
/*
 1 zreladdr-y := 0x80008000
 2 params_phys-y := 0x00000100
 3 initrd_phys-y := 0x00800000
 */

/*
 * restart - fix up run-time addresses, read the decompressed kernel
 * size, and decide whether the decompressor must first copy itself
 * out of the way of the decompressed kernel.
 *
 * Expects r4 = final kernel address. Loads r1-r3, r6, r10-r12 and sp
 * from the LC0 table.
 */
restart:
        adr r0, LC0
        ldmia   r0, {r1, r2, r3, r6, r10, r11, r12}
        ldr sp, [r0, #28]

/*
 * We might be running at a different address. We need
 * to fix up various pointers.
 */
sub r0, r0, r1      @ calculate the delta offset
add r6, r6, r0      @ _edata
add r10, r10, r0        @ inflated kernel size location
/* r6 now holds the run-time address of _edata; r10 holds the run-time address of input_data_end - 4. */
/*
 * The kernel build system appends the size of the
 * decompressed kernel at the end of the compressed data
 * in little-endian form.
 */
        ldrb    r9, [r10, #0]
        ldrb    lr, [r10, #1]
        orr r9, r9, lr, lsl #8
        ldrb    lr, [r10, #2]
        ldrb    r10, [r10, #3]
        orr r9, r9, lr, lsl #16
        orr r9, r9, r10, lsl #24
/* r9 now holds the 32-bit value read byte by byte from that location (little-endian): the size of the decompressed kernel, not an address. r10 was overwritten by the last ldrb. */
#ifndef CONFIG_ZBOOT_ROM 
/* non-ZBOOT_ROM case */
/* malloc space is above the relocated stack (64k max) */
add sp, sp, r0
add r10, sp, #0x10000
#else
/*
 * With ZBOOT_ROM the bss/stack is non relocatable,
 * but someone could still run this code from RAM,
 * in which case our reference is _edata.
 */
mov r10, r6
#endif

mov r5, #0 @ init dtb size to 0

/*
 * Check to see if we will overwrite ourselves.
 * r4 = final kernel address
 * r9 = size of decompressed image
 * r10 = end of this image, including bss/stack/malloc space if non XIP
 * We basically want:
 * r4 - 16k page directory >= r10 -> OK
 * r4 + image length <= address of wont_overwrite -> OK
 */
add r10, r10, #16384 /* adds 16K: r10 is now (relocated sp + 64K) + 16K without ZBOOT_ROM, or _edata + 16K with it. Author's note: the image is loaded at 0x82000000, so this value is above 0x82000000. */
        cmp r4, r10             /* author's platform: r4 = 0x80008000 */
        bhs wont_overwrite  // author's note: not taken here, because r4 is below r10.
add r10, r4, r9 // r10 = r4 + size of decompressed image (0x80008000 + size on the author's platform)
        adr r9, wont_overwrite
        cmp r10, r9 /* author's note: the image is loaded at 0x82000000, so wont_overwrite lies above that address */
        bls wont_overwrite /* author's note: usually taken, because the decompressed kernel ends at or below this code (r10 <= r9). */

/*
 * Relocate ourselves past the end of the decompressed kernel.
 * r6 = _edata
 * r10 = end of the decompressed kernel
 * Because we always copy ahead, we need to do it from the end and go
 * backward in case the source and destination overlap.
 */
/*
 * Bump to the next 256-byte boundary with the size of
 * the relocation code added. This avoids overwriting
 * ourself when the offset is small.
 */
add r10, r10, #((reloc_code_end - restart + 256) & ~255)
        bic r10, r10, #255

/* Get start of code we want to copy and align it down. */
        adr r5, restart
        bic r5, r5, #31

sub r9, r6, r5      @ size to copy
add r9, r9, #31 @ rounded up to a multiple
        bic r9, r9, #31 @ ... of 32 bytes
add r6, r9, r5
add r9, r9, r10

/* Copy eight registers (32 bytes) per pass, moving downwards until r6 reaches r5. */
1:      ldmdb   r6!, {r0 - r3, r10 - r12, lr}
        cmp r6, r5
        stmdb   r9!, {r0 - r3, r10 - r12, lr}
        bhi 1b

/* Preserve offset to relocated code. */
sub r6, r9, r6

#ifndef CONFIG_ZBOOT_ROM
/* cache_clean_flush may use the stack, so relocate it */
add sp, sp, r6
#endif

        bl  cache_clean_flush

/* Continue at the relocated copy of restart (run-time address of restart + r6). */
        adr r0, BSYM(restart)
add r0, r0, r6
mov pc, r0

/*
 * wont_overwrite - reached once decompression cannot overwrite this
 * code. Adds the run-time delta to the GOT bounds, the BSS bounds and
 * the GOT entries, unless both the delta and the dtb size are zero.
 */
wont_overwrite:
/*
 * If delta is zero, we are running at the address we were linked at.
 * r0 = delta
 * r2 = BSS start
 * r3 = BSS end
 * r4 = kernel execution address
 * r5 = appended dtb size (0 if not present)
 * r7 = architecture ID
 * r8 = atags pointer
 * r11 = GOT start
 * r12 = GOT end
 * sp = stack pointer
 */
        orrs    r1, r0, r5          /* r1 = delta | dtb size; Z set when both are zero */
        beq not_relocated

add r11, r11, r0
add r12, r12, r0

#ifndef CONFIG_ZBOOT_ROM
/*
 * If we're running fully PIC === CONFIG_ZBOOT_ROM = n,
 * we need to fix up pointers into the BSS region.
 * Note that the stack pointer has already been fixed up.
 */
add r2, r2, r0
add r3, r3, r0

/*
 * Relocate all entries in the GOT table.
 * Bump bss entries to _edata + dtb size
 */
1:      ldr r1, [r11, #0] @ relocate entries in the GOT
add r1, r1, r0      @ This fixes up C references
        cmp r1, r2          @ if entry >= bss_start &&
        cmphs   r3, r1          @       bss_end > entry
        addhi   r1, r1, r5      @    entry += dtb size
        str r1, [r11], #4 @ next entry
        cmp r11, r12
        blo 1b

/* bump our bss pointers too */
add r2, r2, r5
add r3, r3, r5

#else

/*
 * Relocate entries in the GOT table. We only relocate
 * the entries that are outside the (relocated) BSS region.
 */
1:      ldr r1, [r11, #0] @ relocate entries in the GOT
        cmp r1, r2          @ entry < bss_start ||
        cmphs   r3, r1          @ _end < entry
        addlo   r1, r1, r0      @ table.  This fixes up the
        str r1, [r11], #4 @ C references.
        cmp r11, r12
        blo 1b
#endif

/*
 * not_relocated - zero the BSS (r2 up to r3, four words per pass),
 * call decompress_kernel, then flush and turn off the cache and jump
 * to the address in r4.
 */
not_relocated:  mov r0, #0
1:      str r0, [r2], #4 @ clear bss
        str r0, [r2], #4
        str r0, [r2], #4
        str r0, [r2], #4
        cmp r2, r3
        blo 1b

/*
 * The C runtime environment should now be setup sufficiently.
 * Set up some pointers, and start decompressing.
 * r4 = kernel execution address
 * r7 = architecture ID
 * r8 = atags pointer
 */
/* r0-r3 below are the four arguments of decompress_kernel, in order. */
mov r0, r4  // output address (0x80008000 on the author's platform)
mov r1, sp          @ malloc space above stack
add r2, sp, #0x10000 @ 64k max
mov r3, r7  // architecture ID
        bl  decompress_kernel
        bl  cache_clean_flush
        bl  cache_off
mov r0, #0 @ must be zero
mov r1, r7          @ restore architecture number
mov r2, r8          @ restore atags pointer
 ARM(       mov pc, r4  )       @ call kernel
 THUMB(     bx  r4  )       @ entry point is always ARM

/*
 * LC0 - table of link-time addresses read at restart.
 * The first seven words are loaded with
 * "ldmia r0, {r1, r2, r3, r6, r10, r11, r12}" and the eighth with
 * "ldr sp, [r0, #28]", so the word order here must match that register
 * list. The first word is LC0's own link-time address; restart
 * subtracts it from the run-time address to get the delta.
 */
.align  2
.type   LC0, #object
LC0:        .word   LC0         @ r1
.word   __bss_start     @ r2
.word   _end            @ r3
.word   _edata          @ r6
.word   input_data_end - 4  @ r10 (address of the inflated-size word, i.e. the last 4 bytes of the included piggy.gzip data; not the size of piggy.gzip)
.word   _got_start      @ r11
.word   _got_end        @ ip
.word   .L_user_stack_end   @ sp
.size   LC0, . - LC0

上述代码的功能
1：就是修正运行地址与链接地址之间的差异，包括got表。
.word __bss_start @ r2
.word _end @ r3
.word _edata @ r6
.word input_data_end - 4 @ r10 (inflated size location,sizeof arch/arm/boot/compressed/piggy.gzip)
.word _got_start @ r11
.word _got_end @ ip
.word .L_user_stack_end @ sp
这样才能使得我们的程序正确地运行。比如说我的_bss_start链接地址是0xC0012345.此处有个变量val=10;但是我们在运行uImage的时候，这个bss_start可能是跟链接地址是不同的，如运行的时候处于0x82012345.那么显然，如果我们要想得到val=10，就需要从0x82012345这里获得，而不是0xC0012345，因为0xC0012345的地方现在来说内容还是未知的。修正完了这些地址后，程序才能正常地运行。
功能2：
代码拷贝，拷贝范围是[restart,_edata]，起始地址和长度都按32字节对齐。
从restart+sizeof copy 的地方拷贝到 size+ end of the decompressed kernel的地方。

not_relocated:  mov r0, #0
1:      str r0, [r2], #4 @ clear bss
        str r0, [r2], #4
        str r0, [r2], #4
        str r0, [r2], #4
        cmp r2, r3
        blo 1b

/*
 * The C runtime environment should now be setup sufficiently.
 * Set up some pointers, and start decompressing.
 * r4 = kernel execution address
 * r7 = architecture ID
 * r8 = atags pointer
 */
mov r0, r4  //0x80008000
mov r1, sp          @ malloc space above stack
add r2, sp, #0x10000 @ 64k max
mov r3, r7  //archID
        bl  decompress_kernel
        bl  cache_clean_flush
        bl  cache_off
mov r0, #0 @ must be zero
mov r1, r7          @ restore architecture number
mov r2, r8          @ restore atags pointer
 ARM(       mov pc, r4  )       @ call kernel
 THUMB(     bx  r4  )       @ entry point is always ARM

在执行上述代码过程中，寄存器的变化：
r10：运行时的input_data_end - 4
r9: 从r10指向的位置读出的解压后内核的大小
r6: 运行时的_edata

r10:运行时的栈顶 + 64K（定义了CONFIG_ZBOOT_ROM时为运行时的_edata）
r10=上述值 + 16K,0x82000000+
r4=0x80008000
if (r4 >= r10) 跳转到wont_overwrite

  1         .section .piggydata,#alloc
2         .globl  input_data
3 input_data:
4         .incbin "arch/arm/boot/compressed/piggy.gzip"
5         .globl  input_data_end
6 input_data_end:
~                                                                               
~                                  
   . = TEXT_START;
27   _text = .;
28 
29   .text : {
30     _start = .;
31     *(.start)
32     *(.text)
33     *(.text.*)
34     *(.fixup)
35     *(.gnu.warning)
36     *(.glue_7t)
37     *(.glue_7)
38   }
39   .rodata : {
40     *(.rodata)
41     *(.rodata.*)
42   }
43   .piggydata : {
44     *(.piggydata)
45   }
46

如果不需要重定位（不会发生自我覆盖）

则清除bss段，并且执行了decompress_kernel函数解压内核。

/*
 * decompress_kernel - unpack the embedded compressed kernel image.
 * @output_start:        address the decompressed kernel is written to
 * @free_mem_ptr_p:      start of the free memory handed to the decompressor
 * @free_mem_ptr_end_p:  end of that free memory
 * @arch_id:             architecture ID, stored in __machine_arch_type
 *
 * Records the arguments in globals, calls arch_decomp_setup(), then
 * runs do_decompress() on input_data .. input_data_end with output_data
 * as the destination. Calls error() when do_decompress() returns
 * non-zero; otherwise prints a completion message.
 */
void
decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p,
unsigned long free_mem_ptr_end_p,
int arch_id)
{
int ret;

    output_data     = (unsigned char *)output_start;
    free_mem_ptr        = free_mem_ptr_p;
    free_mem_end_ptr    = free_mem_ptr_end_p;
    __machine_arch_type = arch_id;

    arch_decomp_setup();

    putstr("Uncompressing Linux...");
    /* length argument is the size of the compressed data in bytes */
    ret = do_decompress(input_data, input_data_end - input_data,
                output_data, error);
if (ret)
        error("decompressor returned an error");
else
        putstr(" done, booting the kernel.\n");
}

完成内核解压后，又关闭cache，并把之前保存在r7、r8中的uboot参数恢复到r1、r2（r0清零）


        bl  decompress_kernel
        bl  cache_clean_flush
        bl  cache_off
mov r0, #0 @ must be zero
mov r1, r7          @ restore architecture number
mov r2, r8          @ restore atags pointer
         ARM(       mov pc, r4  )       @ call kernel

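需要注意的是，在以上过程中MMU并不是一直关闭的：在支持VMSA的处理器上，__armv7_mmu_cache_on会先调用__setup_mmu建立页表，再把控制寄存器的bit0置1打开MMU，因为数据cache需要MMU配合才能使用；打开cache是为了提高解压速度，解压完成后再由cache_off把cache和MMU一并关闭。在执行decompress_kernel的时候，由mov r0, r4把r4作为第一个参数（r0）传入，告诉解压函数的输出地址为0x80008000，最后又由 ARM( mov pc, r4 )，成功跳转到了解压后的内核vmlinux。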
由上篇的分析可以知道，uImage是zImage加上64字节的头信息得到的，而zImage又是compressed下的vmlinux经过objcopy得到的，compressed下的vmlinux是由vmlinux.lds、 head.S 和 piggy.gzip.S misc.c编译而成的，其实就是在piggy.gzip中添加了解压代码。piggy.gzip是Image经过gzip -n -f -9得到的，Image是源码目录下的vmlinux经过objcopy后得到的。因此如果zImage进行自解压，解压后的指令序列跟源码目录下的vmlinux的指令序列就应该是一样的。所以，zImage进行自解压后,最后一句ARM( mov pc, r4 )就跳转到了源码根目录下的vmlinux中。其中decompress_kernel准确无误地将vmlinux的内容解压到了r4寄存器指定的地址。
总结一下：
1.判断进入时是否处于用户模式(uboot已经把处理器设置为管理模式，所以直接跳到not_angel)
2.设置CPSR寄存器禁止IRQ和FIQ
3.开启cache(要经过processor查找匹配)
4.fixup 段位置
5.判断是否满足内核自解压内存的环境。检查会不会自我覆盖，如果不会发生自我覆盖则走第6步。我们这里运行时的wont_overwrite地址大于0x80008000+解压后内核的大小。因此条件是成立的。直接走6.
6.清除BSS段。
7.设置栈，准备好r0-r3的参数，执行decompress_kernel
8.关闭cache
9.跳转至zreladdr处。zreladdr在arch/arm/mach-xxx/Makefile.boot中定义
执行vmlinux。

上面分析的就是从uImage到vmlinux的跳转过程了。下篇再介绍vmlinux。

秒客网

Linux 内核启动过程分析----zImage自解压

相关文章