在实验中用到这一块,就去看源码分析整理了一下,全部为个人理解。有错误的地方,希望和大牛交流。
首先解释一下,我实验的目的是获得系统调用入口函数system_call的起始地址和函数大小。
在linux-3.10.1, x86 64位的系统下,系统调用的入口地址保存在MSR寄存器中,通过rdmsrl(MSR_LSTAR,ksystem_call);便可获得系统调用的入口地址,然后对该入口地址进行解析得到入口函数为system_call,具体的函数实现在/linux-3.10.1/arch/x86/kernel/entry_64.S文件中。
entry_64.S为一个汇编文件,即system_call函数是由汇编语言实现的。在ENTRY(system_call)与END(system_call)之间还定义了很多其他的标号(例如system_call_fastpath、ret_from_sys_call、sysret_check等),这与C语言函数的结构不同:system_call以及它内部包含的这些标号都会作为独立的内核符号保存在kallsyms符号表(/proc/kallsyms)中。因此,根据内核栈中的返回地址查询kallsyms时,得到的内核符号可能只是真正被调用函数内部的一个中间标号。通过实验我们也验证了这个猜想:这就是在实验中根据内核函数中的一个地址本该得到内核符号system_call,结果得到的却是system_call_fastpath的原因。
对system_call的函数实现分析之后我们得到图1的处理流程:
图1 system_call的处理过程
为了获取到正确的内核函数信息,在内核模块中使用kallsyms_lookup_name()函数,以system_call为参数可以获得它对应的起始地址;再得到system_call结束(END(system_call))之后的第一个函数对应的地址,两个地址相减,即可计算出system_call函数所占空间的大小。
对于宿主机的系统调用表以及其他内核符号的信息,使用相同的方法获取。
下面是对system_call汇编源码的一些注释:
/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.  However, it does mask the flags register for us, so
 * CLD and CLAC are not needed.
 */

/*
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the stack frame
 *      and report it properly in ps. Unfortunately we haven't.
 *
 * When user can change the frames always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

/*
 * Flow overview (every label below lives inside this one ENTRY/END pair,
 * which is why a symbol lookup on an address in here can report e.g.
 * system_call_fastpath instead of system_call):
 *
 *   system_call -> system_call_fastpath -> ret_from_sys_call -> sysret_check
 *       -> USERGS_SYSRET64                      (no work flags set)
 *       -> sysret_careful / sysret_signal       (work flags set)
 *   tracesys -> int_ret_from_sys_call -> int_with_check -> retint_swapgs
 *   int_careful / int_very_careful / int_check_syscall_exit_work /
 *   int_signal / int_restore_rest all jump back to int_with_check.
 */
ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	/*
	 * Performs the swapgs.
	 * NOTE(review): swapgs only exchanges the user GS base with the
	 * kernel GS base; it does not change the privilege level itself
	 * (the SYSCALL instruction already did that) -- confirm against
	 * the CPU manual.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)

	movq	%rsp,PER_CPU_VAR(old_rsp)	/* remember the incoming rsp */
	movq	PER_CPU_VAR(kernel_stack),%rsp	/* switch to the kernel stack */
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)		/* interrupts back on */
	SAVE_ARGS 8,0
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	/* Any syscall-entry work flag set (e.g. tracing)? Then leave the fast path. */
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja badsys			/* number above the table: -ENOSYS */
	movq %r10,%rcx			/* arg3 arrives in r10, C expects rcx */
	/* Dispatch to the handler for this system call number. */
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)	/* store the handler's return value */
/*
 * Syscall return path ending with SYSRET (fast path)
 * Has incomplete stack frame and undefined top of stack.
 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_NONE)		/* interrupts off */
	TRACE_IRQS_OFF
	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER	rip,rcx
	RESTORE_ARGS 1,-ARG_SKIP,0
	/*CFI_REGISTER	rflags,r11*/
	movq	PER_CPU_VAR(old_rsp), %rsp	/* back to the saved rsp */
	USERGS_SYSRET64

	CFI_RESTORE_STATE
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi			/* keep the work mask across the call */
	SCHEDULE_USER
	popq_cfi %rdi
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/*
	 * We have a signal, or exit tracing or single-step.
	 * These all wind up with the iret return path anyway,
	 * so just join that path right now.
	 */
	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
	/* The fast path ends here; continue with the regular exit work. */
	jmp int_check_syscall_exit_work

	/* Everything above is the fast-path handling of a system call. */
badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call

#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call __audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call __audit_syscall_entry
	LOAD_ARGS 0		/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
	setbe %al		/* 1 if so, 0 if not */
	movzbl %al,%edi		/* zero-extend that into %edi */
	call __audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL */
	/* The two audit helpers above both rejoin the fast path. */

	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	/* Only the audit flag is set: use the cheaper auditsys path. */
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx	/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */

/*
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz   int_careful
	andl    $~TS_COMPAT,TI_status(%rcx)
	/*
	 * No work flags left: all system call work is done. Leave through
	 * retint_swapgs (defined outside this excerpt) to return to user space.
	 */
	jmp   retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi			/* keep the work mask across the call */
	SCHEDULE_USER
	popq_cfi %rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
	SAVE_REST
	/* Check for syscall exit trace */
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
	jz int_signal
	pushq_cfi %rdi
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
	call syscall_trace_leave
	popq_cfi %rdi
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp int_restore_rest

int_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)