Linux 内核启动过程分析----zImage自解压

时间:2021-09-09 16:45:46

要想移植内核,肯定是要知道内核的启动过程的,包括协处理器的操作。这对我们理解ARM工作方式、MMU配置,中断是很有帮助的。最近写代码太少,有时间研究内核了。下面是我个人主观对内核启动过程的分析,如有不同,请提出探讨,共同进步!
上一篇分析过了,uboot引导uImage后最先执行的代码是由arch/arm/boot/compressed下的vmlinux.lds文件决定的:链接脚本把_start放在.text段的最开头,紧接着就是.start段,也就是head.S中的start标号。这段代码定义在arch/arm/boot/compressed/head.S中,那么就先分析arch/arm/boot/compressed/head.S吧。
这个head.S开头定义了一些跟调试相关的宏,先不考虑。

start:
.type start,#function
.rept 7
mov r0, r0
.endr
ARM( mov r0, r0 )
ARM( b 1f )
THUMB( adr r12, BSYM(1f) )
THUMB( bx r12 )

.word 0x016f2818 @ Magic numbers to help the loader
.word start @ absolute load/run zImage address
.word _edata @ zImage end address
THUMB( .thumb )
1: mov r7, r1 @ save architecture ID
mov r8, r2 @ save atags pointer
//
/*arm/Makefile:117:KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_THUMB2) $(arch-y) $(tune-y) -include asm/unified.h -msoft-float
在unified.h中定义了
由于我们没有定义CONFIG_THUMB2_KERNEL,因此
#define ARM(x...) x
#define THUMB(x...)
#define W(instr) instr
#define BSYM(sym) sym
*/

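在start的开始处,使用.rept 7重复了7条mov r0, r0空指令(加上紧随其后的ARM( mov r0, r0 ),一共8条、32字节)。它的目的并不是为了留出异常向量表的位置:较新版本内核源码中的注释说明,这是为了兼容一些老的bootloader,它们假定内核是a.out格式,会跳过开头32字节的头部再开始执行,用8条空指令填满这32字节后,无论从偏移0还是偏移32进入都能正常运行。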
//mov r7, r1 @ save architecture ID
// mov r8, r2 @ save atags pointer
这两句是关键,就是这两行保存了uboot传进来的参数。
uboot的最后执行的是theKernel(0, archid, bd->bi_boot_params);
而这个theKernel正好是跳转到了start这个位置。由于参数不足4个,按照ARM的调用约定分别通过r0/r1/r2寄存器传递,其中r0=0,r1=archid,r2=atags参数的首地址。
这是第一步:将uboot传进来的arch id 和 atags指针保存在r7、r8中。

#ifndef __ARM_ARCH_2__
/*
* Booting from Angel - need to enter SVC mode and disable
* FIQs/IRQs (numeric definitions from angel arm.h source).
* We only do this if we were in user mode on entry.
*/

mrs r2, cpsr @ get current mode
tst r2, #3 @ not user?在uboot阶段已经处于SVC模式
bne not_angel //此处条件为真,跳转到not_angel
mov r0, #0x17 @ angel_SWIreason_EnterSVC
ARM( swi 0x123456 ) @ angel_SWI_ARM
THUMB( svc 0xab ) @ angel_SWI_THUMB
not_angel:
mrs r2, cpsr @ turn off interrupts to
orr r2, r2, #0xc0 @ prevent angel from running
msr cpsr_c, r2
//将CPSR的bit6bit7设置为1.对应的是I位和F位:当I=1时禁止IRQ中断,当F=1时禁止FIQ中断,因此作用就是关闭中断和快速中断
#else
teqp pc, #0x0c000003 @ turn off interrupts
#endif
ldr r4, =zreladdr //定义在arch/arm/mach-xxx/Makefile.boot,我这的值为 /*
1 zreladdr-y := 0x80008000
2 params_phys-y := 0x00000100
3 initrd_phys-y := 0x00800000
*/

bl cache_on //开启cache。

cache_on操作这一部分的代码如下:

cache_on:   mov r3, #8 @ cache_on function
b call_cache_fn
call_cache_fn: adr r12, proc_types
#ifdef CONFIG_CPU_CP15
mrc p15, 0, r9, c0, c0 @ get processor ID
#else
ldr r9, =CONFIG_PROCESSOR_ID
#endif
1: ldr r1, [r12, #0] @ get value
ldr r2, [r12, #4] @ get mask
eor r1, r1, r9 @ (real ^ match)//显然如果r1==r9,那么r1=0
tst r1, r2 @ & mask //r1&r2=0
ARM( addeq pc, r12, r3 ) @ call cache function
THUMB( addeq r12, r3 )
THUMB( moveq pc, r12 ) @ call cache function
add r12, r12, #PROC_ENTRY_SIZE
/* compressed/head.S:613:#define PROC_ENTRY_SIZE (4*5)
因为proc_types表中每个表项占5个32位字(2个.word数据加3条指令),共20字节,r12加上它就指向下一个processor的表项。
*/

b 1b

/*
* Table for cache operations. This is basically:
* - CPU ID match
* - CPU ID mask
* - 'cache on' method instruction
* - 'cache off' method instruction
* - 'cache flush' method instruction
*
* We match an entry using: ((real_id ^ match) & mask) == 0
*
* Writethrough caches generally only need 'on' and 'off'
* methods. Writeback caches _must_ have the flush method
* defined.
*/

.align 2
.type proc_types,#object
proc_types:
.word 0x41560600 @ ARM6/610
.word 0xffffffe0
W(b) __arm6_mmu_cache_off @ works, but slow
W(b) __arm6_mmu_cache_off
mov pc, lr
#if !defined(CONFIG_CPU_V7)
/* This collides with some V7 IDs, preventing correct detection */
.word 0x00000000 @ old ARM ID
.word 0x0000f000
mov pc, lr
mov pc, lr
mov pc, lr
#endif

.word 0x41007000 @ ARM7/710
.word 0xfff8fe00
W(b) __arm7_mmu_cache_off
W(b) __arm7_mmu_cache_off
mov pc, lr

.word 0x41807200 @ ARM720T (writethrough)
.word 0xffffff00
W(b) __armv4_mmu_cache_on
W(b) __armv4_mmu_cache_off
mov pc, lr

..../*此处定义了多种支持的processor*/.......

.word 0x000f0000 @ new CPU Id
.word 0x000f0000
W(b) __armv7_mmu_cache_on
W(b) __armv7_mmu_cache_off
W(b) __armv7_mmu_cache_flush
.word 0 @ unrecognised type
.word 0
mov pc, lr
THUMB( nop )
mov pc, lr
THUMB( nop )
mov pc, lr
THUMB( nop )
.size proc_types, . - proc_types

上述条件如果匹配成功了,那么就会执行proc_types中匹配到的表项起始地址+8个字节(偏移由r3指定)处的指令,对于armv7,执行的就是__armv7_mmu_cache_on。

        .word   0x000f0000      @ new CPU Id
.word 0x000f0000
W(b) __armv7_mmu_cache_on
W(b) __armv7_mmu_cache_off
W(b) __armv7_mmu_cache_flush

__armv7_mmu_cache_on:
mov r12, lr //保存返回地址到r12.
#ifdef CONFIG_MMU
mrc p15, 0, r11, c0, c1, 4 @ read ID_MMFR0
tst r11, #0xf @ VMSA
blne __setup_mmu
mov r0, #0
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
tst r11, #0xf @ VMSA
mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
#endif
mrc p15, 0, r0, c1, c0, 0 @ read control reg
bic r0, r0, #1 << 28 @ clear SCTLR.TRE
orr r0, r0, #0x5000 @ I-cache enable, RR cache replacement
orr r0, r0, #0x003c @ write buffer
#ifdef CONFIG_MMU
#ifdef CONFIG_CPU_ENDIAN_BE8
orr r0, r0, #1 << 25 @ big-endian page tables
#endif
orrne r0, r0, #1 @ MMU enabled
movne r1, #-1
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
#endif
mcr p15, 0, r0, c7, c5, 4 @ ISB
mcr p15, 0, r0, c1, c0, 0 @ load control register
mrc p15, 0, r0, c1, c0, 0 @ and read it back
mov r0, #0
mcr p15, 0, r0, c7, c5, 4 @ ISB
mov pc, r12 //这里返回,返回到bl cache_on的下一条指令。
开启cache的过程中会设置MMU,完成cache_on的工作后,返回到了bl    cache_on的下一条指令。
bl  cache_on
/*
1 zreladdr-y := 0x80008000
2 params_phys-y := 0x00000100
3 initrd_phys-y := 0x00800000
*/


restart:
adr r0, LC0
ldmia r0, {r1, r2, r3, r6, r10, r11, r12}
ldr sp, [r0, #28]

/*
* We might be running at a different address. We need
* to fix up various pointers.
*/

sub r0, r0, r1 @ calculate the delta offset
add r6, r6, r0 @ _edata
add r10, r10, r0 @ inflated kernel size location
/*r6存放了运行时_edata的地址,r10存放了运行时候input_data_end-4 的地址。*/
/*
* The kernel build system appends the size of the
* decompressed kernel at the end of the compressed data
* in little-endian form.
*/

ldrb r9, [r10, #0]
ldrb lr, [r10, #1]
orr r9, r9, lr, lsl #8
ldrb lr, [r10, #2]
ldrb r10, [r10, #3]
orr r9, r9, lr, lsl #16
orr r9, r9, r10, lsl #24
/*r9存放的是r10所指向的地址(运行时的input_data_end - 4)处的4个字节按小端模式拼成的32位值,也就是解压后内核的大小,而不是地址本身。*/
#ifndef CONFIG_ZBOOT_ROM
/*here is */
/* malloc space is above the relocated stack (64k max) */
add sp, sp, r0
add r10, sp, #0x10000
#else
/*
* With ZBOOT_ROM the bss/stack is non relocatable,
* but someone could still run this code from RAM,
* in which case our reference is _edata.
*/

mov r10, r6
#endif

mov r5, #0 @ init dtb size to 0

/*
* Check to see if we will overwrite ourselves.
* r4 = final kernel address
* r9 = size of decompressed image
* r10 = end of this image, including bss/stack/malloc space if non XIP
* We basically want:
* r4 - 16k page directory >= r10 -> OK
* r4 + image length <= address of wont_overwrite -> OK
*/

add r10, r10, #16384 /*16K : r10 = 镜像结束地址(未定义CONFIG_ZBOOT_ROM时为sp+64K,否则为_edata)+ 16K,大约是0x82000000 + image的大小,显然这个值一定大于0x82000000*/
cmp r4, r10 /*此处r4=0x80008000*/
bhs wont_overwrite //显然条件不成立。
add r10, r4, r9 //r10=0x80008000+size of decompressed image
adr r9, wont_overwrite
cmp r10, r9 /*因为我们加载内核的地址是0x8200_0000,因此此处wont_overwrite处于0x82000000以后的位置*/
bls wont_overwrite /*一般内核镜像不会太大,这里一般条件是成立的。r10<r9.*/

/*
* Relocate ourselves past the end of the decompressed kernel.
* r6 = _edata
* r10 = end of the decompressed kernel
* Because we always copy ahead, we need to do it from the end and go
* backward in case the source and destination overlap.
*/

/*
* Bump to the next 256-byte boundary with the size of
* the relocation code added. This avoids overwriting
* ourself when the offset is small.
*/

add r10, r10, #((reloc_code_end - restart + 256) & ~255)
bic r10, r10, #255

/* Get start of code we want to copy and align it down. */
adr r5, restart
bic r5, r5, #31

sub r9, r6, r5 @ size to copy
add r9, r9, #31 @ rounded up to a multiple
bic r9, r9, #31 @ ... of 32 bytes
add r6, r9, r5
add r9, r9, r10

1: ldmdb r6!, {r0 - r3, r10 - r12, lr}
cmp r6, r5
stmdb r9!, {r0 - r3, r10 - r12, lr}
bhi 1b

/* Preserve offset to relocated code. */
sub r6, r9, r6

#ifndef CONFIG_ZBOOT_ROM
/* cache_clean_flush may use the stack, so relocate it */
add sp, sp, r6
#endif

bl cache_clean_flush

adr r0, BSYM(restart)
add r0, r0, r6
mov pc, r0

wont_overwrite:
/*
* If delta is zero, we are running at the address we were linked at.
* r0 = delta
* r2 = BSS start
* r3 = BSS end
* r4 = kernel execution address
* r5 = appended dtb size (0 if not present)
* r7 = architecture ID
* r8 = atags pointer
* r11 = GOT start
* r12 = GOT end
* sp = stack pointer
*/

orrs r1, r0, r5
beq not_relocated

add r11, r11, r0
add r12, r12, r0

#ifndef CONFIG_ZBOOT_ROM
/*
* If we're running fully PIC === CONFIG_ZBOOT_ROM = n,
* we need to fix up pointers into the BSS region.
* Note that the stack pointer has already been fixed up.
*/

add r2, r2, r0
add r3, r3, r0

/*
* Relocate all entries in the GOT table.
* Bump bss entries to _edata + dtb size
*/

1: ldr r1, [r11, #0] @ relocate entries in the GOT
add r1, r1, r0 @ This fixes up C references
cmp r1, r2 @ if entry >= bss_start &&
cmphs r3, r1 @ bss_end > entry
addhi r1, r1, r5 @ entry += dtb size
str r1, [r11], #4 @ next entry
cmp r11, r12
blo 1b

/* bump our bss pointers too */
add r2, r2, r5
add r3, r3, r5

#else

/*
* Relocate entries in the GOT table. We only relocate
* the entries that are outside the (relocated) BSS region.
*/

1: ldr r1, [r11, #0] @ relocate entries in the GOT
cmp r1, r2 @ entry < bss_start ||
cmphs r3, r1 @ _end < entry
addlo r1, r1, r0 @ table. This fixes up the
str r1, [r11], #4 @ C references.
cmp r11, r12
blo 1b
#endif

not_relocated: mov r0, #0
1: str r0, [r2], #4 @ clear bss
str r0, [r2], #4
str r0, [r2], #4
str r0, [r2], #4
cmp r2, r3
blo 1b

/*
* The C runtime environment should now be setup sufficiently.
* Set up some pointers, and start decompressing.
* r4 = kernel execution address
* r7 = architecture ID
* r8 = atags pointer
*/

mov r0, r4 //0x80008000
mov r1, sp @ malloc space above stack
add r2, sp, #0x10000 @ 64k max
mov r3, r7 //archID
bl decompress_kernel
bl cache_clean_flush
bl cache_off
mov r0, #0 @ must be zero
mov r1, r7 @ restore architecture number
mov r2, r8 @ restore atags pointer
ARM( mov pc, r4 ) @ call kernel
THUMB( bx r4 ) @ entry point is always ARM

.align 2
.type LC0, #object
LC0: .word LC0 @ r1
.word __bss_start @ r2
.word _end @ r3
.word _edata @ r6
.word input_data_end - 4 @ r10 (inflated size location,即piggy.gzip末尾的4个字节,存放的是解压后内核的大小)
.word _got_start @ r11
.word _got_end @ ip
.word .L_user_stack_end @ sp
.size LC0, . - LC0

上述代码的功能
1:就是修正运行地址与链接地址之间的差异,包括got表。
.word __bss_start @ r2
.word _end @ r3
.word _edata @ r6
.word input_data_end - 4 @ r10 (inflated size location,即piggy.gzip末尾的4个字节,存放的是解压后内核的大小)
.word _got_start @ r11
.word _got_end @ ip
.word .L_user_stack_end @ sp
这样才能使得我们的程序正确地运行。比如说我的_bss_start链接地址是0xC0012345.此处有个变量val=10;但是我们在运行uImage的时候,这个bss_start可能是跟链接地址是不同的,如运行的时候处于0x82012345.那么显然,如果我们要想得到val=10,就需要从0x82012345这里获得,而不是0xC0012345,因为0xC0012345的地方现在来说内容还是未知的。修正完了这些地址后,程序才能正常地运行。
功能2:
代码拷贝,拷贝范围是[restart, _edata],起始地址向下、拷贝长度向上按32字节对齐。
拷贝是从后往前进行的(以防源区域和目的区域重叠):源区间的末尾是restart + 拷贝长度,目的区间的末尾是解压后内核的结束地址(再留出重定位代码的大小并按256字节对齐)+ 拷贝长度。

not_relocated:  mov r0, #0
1: str r0, [r2], #4 @ clear bss
str r0, [r2], #4
str r0, [r2], #4
str r0, [r2], #4
cmp r2, r3
blo 1b

/*
* The C runtime environment should now be setup sufficiently.
* Set up some pointers, and start decompressing.
* r4 = kernel execution address
* r7 = architecture ID
* r8 = atags pointer
*/

mov r0, r4 //0x80008000
mov r1, sp @ malloc space above stack
add r2, sp, #0x10000 @ 64k max
mov r3, r7 //archID
bl decompress_kernel
bl cache_clean_flush
bl cache_off
mov r0, #0 @ must be zero
mov r1, r7 @ restore architecture number
mov r2, r8 @ restore atags pointer
ARM( mov pc, r4 ) @ call kernel
THUMB( bx r4 ) @ entry point is always ARM

在执行上述代码过程中,寄存器的变化:
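r10:运行时input_data_end - 4的地址(该处存放着解压后内核的大小)
r9: 解压后内核的大小(从r10指向的4个字节按小端模式读出)
r6: 运行时的_edata

r10:镜像的结束地址(未定义CONFIG_ZBOOT_ROM时为sp + 64K,否则为运行时的_edata)
r10=上述结束地址 + 16K,0x82000000+
r4=0x80008000
if (r4 >= r10) 则跳转到wont_overwrite(此处r4 < r10,条件不成立,继续比较r4 + r9与wont_overwrite的运行地址)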

  1         .section .piggydata,#alloc
2 .globl input_data
3 input_data:
4 .incbin "arch/arm/boot/compressed/piggy.gzip"
5 .globl input_data_end
6 input_data_end:
~
~
. = TEXT_START;
27 _text = .;
28
29 .text : {
30 _start = .;
31 *(.start)
32 *(.text)
33 *(.text.*)
34 *(.fixup)
35 *(.gnu.warning)
36 *(.glue_7t)
37 *(.glue_7)
38 }
39 .rodata : {
40 *(.rodata)
41 *(.rodata.*)
42 }
43 .piggydata : {
44 *(.piggydata)
45 }
46

如果不需要搬移自身(即不会发生自我覆盖)

则清除bss段,并且执行了decompress_kernel函数解压内核。

void
decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p,
unsigned long free_mem_ptr_end_p,
int arch_id)
{
int ret;

output_data = (unsigned char *)output_start;
free_mem_ptr = free_mem_ptr_p;
free_mem_end_ptr = free_mem_ptr_end_p;
__machine_arch_type = arch_id;

arch_decomp_setup();

putstr("Uncompressing Linux...");
ret = do_decompress(input_data, input_data_end - input_data,
output_data, error);
if (ret)
error("decompressor returned an error");
else
putstr(" done, booting the kernel.\n");
}

完成内核解压后,刷新并关闭cache,然后把之前保存在r7、r8中的uboot参数恢复到r1、r2(r0置0)


bl decompress_kernel
bl cache_clean_flush
bl cache_off
mov r0, #0 @ must be zero
mov r1, r7 @ restore architecture number
mov r2, r8 @ restore atags pointer
ARM( mov pc, r4 ) @ call kernel

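在以上整个过程中,MMU并不是一直处于关闭状态:在定义了CONFIG_MMU且处理器支持VMSA时,cache_on会先调用__setup_mmu建立页表,再把控制寄存器的bit0置1打开MMU(见上面的orrne r0, r0, #1 @ MMU enabled),这样才能使能数据cache,目的是提高解压速度;解压完成后再由cache_off把它们关掉。在执行decompress_kernel的时候,由mov r0, r4把r4作为第一个参数(output_start)传入,告诉解压函数的输出地址为0x80008000,最后又由 ARM( mov pc, r4 ),成功跳转到了解压后的内核vmlinux。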
由上篇的分析可以知道,uImage是zImage加上64字节的头信息得到的,而zImage又是compressed下的vmlinux经过objcopy得到的,compressed下的vmlinux是由vmlinux.lds、 head.S 和 piggy.gzip.S misc.c编译而成的,其实就是在piggy.gzip中添加了解压代码。piggy.gzip是Image经过gzip -n -f -9得到的,Image是源码目录下的vmlinux经过objcopy后得到的。因此如果zImage进行自解压,解压后的指令序列跟源码目录下的vmlinux的指令序列就应该是一样的。所以,zImage进行自解压后,最后一句ARM( mov pc, r4 )就跳转到了源码根目录下的vmlinux中。其中decompress_kernel准确无误地将vmlinux的内容解压到了r4寄存器指定的地址。
总结一下:
1.判断是否为管理模式(条件当然是成立的,在uboot已经设置过了)
2.设置CPSR寄存器禁止IRQ和FIQ
3.开启cache(要经过processor查找匹配)
4.fixup 段位置
5.判断是否满足内核自解压内存的环境。检查会不会自我覆盖,如果不会发生自我覆盖则走第6步。我们这里运行时的wont_overwrite地址大于0x80008000+解压后内核的大小(注意是解压后的大小,而不是piggy.gzip的大小),因此条件是成立的,直接走6。
6.清除BSS段。
7.设置栈,准备好r0-r3的参数,执行decompress_kernel
8.关闭cache
9.跳转至zreladdr处(在arch/arm/mach-xxx/Makefile.boot中定义),
执行vmlinux。

上面分析的就是从uImage到vmlinux的跳转过程了。下篇再介绍vmlinux。