Linux Kernel Analysis: Process Address Space Management

Date: 2025-03-12 14:35:31

1. struct task_struct

For every process the kernel allocates a 4 KB or 8 KB region of memory as its kernel stack; at the low end of that region sits the thread_info structure, which points to the process descriptor, task_struct. The task_struct is what completely describes a running program: the files it has opened, its address space, its pending signals, its state, and so on — from task_struct the kernel can see the full picture of an executing program.


The thread_info structure (stored at the bottom of the kernel stack):

 struct thread_info {
struct task_struct *task; /* main task structure */
unsigned long flags;
struct exec_domain *exec_domain; /* execution domain */
int preempt_count; /* 0 => preemptable, <0 => BUG */
__u32 cpu; /* should always be 0 on m68k */
struct restart_block restart_block;
};
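
As a rough illustration of how these pieces connect (a minimal sketch, not the kernel's actual code, assuming a 2.6-era 32-bit x86 kernel with an 8 KB THREAD_SIZE): thread_info sits at the bottom of the THREAD_SIZE-aligned kernel stack, so the kernel can locate it simply by masking the low bits of the stack pointer, and from there reach task_struct through the task pointer.

 /* Sketch only: locating thread_info from the kernel stack pointer on a
  * 2.6-era 32-bit x86 kernel.  thread_info lives at the low end of the
  * THREAD_SIZE-aligned kernel stack, so clearing the low bits of %esp
  * yields its address; task_struct is then one dereference away. */
 #define THREAD_SIZE 8192 /* 8 KB kernel stack; configuration-dependent */

 static inline struct thread_info *current_thread_info_sketch(void)
 {
 unsigned long sp;

 asm ("movl %%esp, %0" : "=r" (sp));
 return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
 }

 /* the running process's descriptor:
  * struct task_struct *tsk = current_thread_info_sketch()->task; */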

  task_struct:

 struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;

int lock_depth; /* BKL lock depth */

#ifdef CONFIG_SMP
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
int oncpu;
#endif
#endif
int load_weight; /* for niceness load balancing purposes */
int prio, static_prio, normal_prio;
struct list_head run_list;
struct prio_array *array;

unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
enum sleep_type sleep_type;

unsigned int policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif

struct list_head tasks;
/*
* ptrace_list/ptrace_children forms the list of my children
* that were stolen by a ptracer.
*/
struct list_head ptrace_children;
struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;

/* task state */
struct linux_binfmt *binfmt;
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
pid_t pid;
pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->parent->pid)
*/
struct task_struct *real_parent; /* real parent process (when being debugged) */
struct task_struct *parent; /* parent process */
/*
* children/sibling forms the list of my children plus the
* tasks I'm ptracing.
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;

struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

unsigned int rt_priority;
cputime_t utime, stime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

cputime_t it_prof_expires, it_virt_expires;
unsigned long long it_sched_expires;
struct list_head cpu_timers[3];

/* process credentials */
uid_t uid,euid,suid,fsuid;
gid_t gid,egid,sgid,fsgid;
struct group_info *group_info;
kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
unsigned keep_capabilities:1;
struct user_struct *user;
#ifdef CONFIG_KEYS
struct key *request_key_auth; /* assumed request_key authority */
struct key *thread_keyring; /* keyring private to this thread */
unsigned char jit_keyring; /* default keyring to attach requested keys to */
#endif
/*
* fpu_counter contains the number of consecutive context switches
* that the FPU is used. If this is over a threshold, the lazy fpu
* saving becomes unlazy to save the trap. This is an unsigned char
* so that after 256 times the counter wraps and the behavior turns
* lazy again; this to deal with bursty apps that only use FPU for
* a short time
*/
unsigned char fpu_counter;
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by flush_old_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;

void *security;
struct audit_context *audit_context;
seccomp_t seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
spinlock_t alloc_lock;

/* Protection of the PI data structures: */
spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
int hardirqs_enabled;
unsigned long hardirq_enable_ip;
unsigned int hardirq_enable_event;
unsigned long hardirq_disable_ip;
unsigned int hardirq_disable_event;
int softirqs_enabled;
unsigned long softirq_disable_ip;
unsigned int softirq_disable_event;
unsigned long softirq_enable_ip;
unsigned int softirq_enable_event;
int hardirq_context;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 30UL
u64 curr_chain_key;
int lockdep_depth;
struct held_lock held_locks[MAX_LOCK_DEPTH];
unsigned int lockdep_recursion;
#endif

/* journalling filesystem info */
void *journal_info;

/* stacked block device info */
struct bio *bio_list, **bio_tail;

/* VM state */
struct reclaim_state *reclaim_state;

struct backing_dev_info *backing_dev_info;

struct io_context *io_context;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
/*
* current io wait handle: wait queue entry to use for io waits
* If this thread is processing aio, this points at the waitqueue
* inside the currently handled kiocb. It may be NULL (i.e. default
* to a stack based synchronous wait) if its doing sync IO.
*/
wait_queue_t *io_wait;
#ifdef CONFIG_TASK_XACCT
/* i/o counters (bytes read/written, #syscalls) */
u64 rchar, wchar, syscr, syscw;
#endif
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_stimexpd;/* stime since last update */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
#endif
#ifdef CONFIG_CPUSETS
struct cpuset *cpuset;
nodemask_t mems_allowed;
int cpuset_mems_generation;
int cpuset_mem_spread_rotor;
#endif
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;

atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;

/*
* cache last used pipe for splice
*/
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
};

From thread_info we reach task_struct, and from task_struct the memory descriptor, struct mm_struct:

 struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
struct vm_area_struct * mmap_cache; /* last find_vma result */
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
unsigned long mmap_base; /* base of mmap area */
unsigned long task_size; /* size of task vm space */
unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
spinlock_t page_table_lock; /* Protects page tables and some counters */

struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
* by mmlist_lock (the list linking all mm_structs in the system)
*/

/* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */

unsigned long total_vm, locked_vm, shared_vm, exec_vm;
unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
unsigned long start_code, end_code, start_data, end_data; /* start/end addresses of the code and data segments */
unsigned long start_brk, brk, start_stack; /* start of the heap, current end of the heap, start of the stack */
unsigned long arg_start, arg_end, env_start, env_end; /* start/end of the command-line arguments and of the environment */

unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

cpumask_t cpu_vm_mask;

/* Architecture-specific MM context */
mm_context_t context;

/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
* In other words, this value gives an indication of how long
* it has been since this task got the token.
* Look at mm/thrash.c
*/
unsigned int faultstamp;
unsigned int token_priority;
unsigned int last_interval;

unsigned char dumpable:2; /* coredumping support */
int core_waiters;
struct completion *core_startup_done, core_done;

/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
};
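
As a quick illustration of how these fields are used (a minimal sketch, not taken from the kernel source; it assumes process context with mm->mmap_sem held for reading), the function below walks the VMA list that mm_struct anchors — the same list that map_count counts and find_vma searches:

 /* Sketch only: count a task's VMAs by walking mm->mmap.
  * Caller is assumed to hold mm->mmap_sem for reading. */
 static int count_vmas(struct mm_struct *mm)
 {
 struct vm_area_struct *vma;
 int n = 0;

 for (vma = mm->mmap; vma; vma = vma->vm_next)
 n++;

 return n; /* should equal mm->map_count */
 }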

   A process's memory areas can contain various kinds of memory objects, as the mm_struct above already suggests. The main ones are (a small demo follows the list):

  1. A memory map of the executable file's code, called the text segment.

  2. A memory map of the executable file's initialized global variables, called the data segment.

  3. A zero-page memory map containing the uninitialized global variables, i.e. the bss segment.

  4. A zero-page memory map for the process's user-space stack. Do not confuse this with the process's kernel stack: the kernel stack exists separately and is maintained by the kernel, since the kernel manages every process — the kernel stack and the descriptors described earlier are what the kernel uses to keep track of the process.

  5. The text segment, data segment, and bss of every shared library loaded into the process's address space, such as the C library or the dynamic linker.

  6. Any memory-mapped files.

  7. Any shared memory segments.

  8. Any anonymous memory mappings, such as memory obtained through malloc().
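
To see these areas on a live system, the short user-space demo below (illustrative only, not kernel code) prints its own mapping table from /proc/self/maps; each output line corresponds to one vm_area_struct — the text and data segments, the heap, mapped libraries, anonymous mappings, and the stack:

 /* Demo: dump this process's own memory areas, one line per VMA. */
 #include <stdio.h>

 int main(void)
 {
 FILE *f = fopen("/proc/self/maps", "r");
 char line[512];

 if (!f) {
 perror("fopen /proc/self/maps");
 return 1;
 }
 while (fgets(line, sizeof(line), f))
 fputs(line, stdout);
 fclose(f);
 return 0;
 }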

  To summarize, a process's address space can be understood as the memory space a program occupies while it runs on the operating system. In 《程序员的自我修养》 the authors put it this way: a program and a process are like cooking a dish — the program is the recipe, the CPU is the cook, and the whole act of cooking is the process. The code we write, with its global and static variables and the memory it will request, sits on disk as long as it is not running; once it runs it becomes a process and gets its own address space, a region of memory (memory here means RAM, where code can actually execute, not disk space).

  On Linux and Unix systems, the address space of a process — of running code — is typically arranged as follows:

  [Figure: typical layout of a process's address space]

  Put differently, everything above describes how the kernel, through the per-process structures on the kernel stack, keeps track of what a process does inside its address space; what the figure shows is simply the classic storage layout of a C program.

  Text: the machine instructions executed by the CPU. The text segment is usually read-only, to prevent a program from accidentally modifying its own instructions.

  Initialized data segment: usually simply called the data segment. It contains variables that are explicitly given an initial value in the program; a declaration outside any function, such as int a = 9;, causes the variable to be stored, with its initial value, in the initialized data segment.

  Uninitialized data segment: usually called the bss segment. Before the program starts executing, the kernel initializes the data in this segment to zeros or null pointers. A declaration outside any function, such as long sum[1000];, places the variable in the uninitialized data segment.

  Stack: the stack is essential — function calls depend on it to save their data. Automatic variables, together with the information that must be saved on each function call, live here. Each time a function is called, its return address and the caller's environment (such as the values of certain machine registers) are pushed onto the stack, and the newly called function then allocates space on the stack for its automatic and temporary variables. Using the stack this way is what makes recursive C functions work: each recursive call gets a fresh stack frame, so the variables of one invocation do not interfere with those of another.

  Heap: dynamic storage allocation usually takes place on the heap. By historical convention, the heap sits between the uninitialized data segment and the stack.
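
The small demo below makes this layout visible from user space. It is only a sketch: it relies on the traditional etext/edata/end symbols that the linker provides on Linux (they must be declared by hand and are platform-dependent), so treat the printed addresses as illustrative rather than definitive.

 /* Demo: print one address from each region of a C program's storage layout. */
 #include <stdio.h>
 #include <stdlib.h>

 extern char etext, edata, end; /* linker symbols: end of text, of initialized data, of bss */

 int initialized_global = 9; /* data segment */
 long uninitialized_global[1000]; /* bss segment */

 int main(void)
 {
 int local = 0; /* stack */
 char *dynamic = malloc(16); /* heap */

 printf("end of text   : %p\n", (void *)&etext);
 printf("end of data   : %p\n", (void *)&edata);
 printf("end of bss    : %p\n", (void *)&end);
 printf("heap object   : %p\n", (void *)dynamic);
 printf("stack variable: %p\n", (void *)&local);

 free(dynamic);
 return 0;
 }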

  In C we frequently request dynamic storage with malloc(), calloc(), and realloc():

  #include <stdlib.h>

  void *malloc(size_t size);

  void *calloc(size_t nmemb, size_t size);

  void *realloc(void *ptr, size_t size);

  void free(void *ptr);

  The pointers returned by these three allocation functions are guaranteed to be suitably aligned so that they can be used for any data object. malloc() allocates the specified number of bytes; the initial contents of the allocated storage are indeterminate.
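
A tiny demo of that last point (illustrative only): calloc() hands back a zeroed block, whereas the contents of a block returned by malloc() are indeterminate until you store into it.

 /* Demo: malloc() leaves the block uninitialized, calloc() zeroes it. */
 #include <stdio.h>
 #include <stdlib.h>

 int main(void)
 {
 int *a = malloc(4 * sizeof *a); /* contents indeterminate */
 int *b = calloc(4, sizeof *b); /* contents guaranteed to be zero */

 if (!a || !b)
 return 1;

 printf("calloc block: %d %d %d %d\n", b[0], b[1], b[2], b[3]);

 free(a);
 free(b);
 return 0;
 }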

  These allocation routines are library functions. The libraries we use most often — the C standard library, provided on Linux by glibc — already implement them; malloc(), for example, is a C library function built on top of the brk() and mmap() system calls. As discussed earlier, all memory allocation is ultimately managed by the kernel through the process's descriptors: when we call an allocation function in glibc, a system call is made and the kernel performs the dynamic allocation inside the heap of the process's address space. Concretely, malloc() calls brk(), brk() traps into the kernel via a system call, and the kernel runs sys_brk(addr) to carry out the allocation (see the small user-space demo below).
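
The user-space demo below shows the program break moving. It is only a sketch: whether a particular malloc() call really extends the break depends on the glibc allocator — small requests may be served from an arena it already owns, and requests above the mmap threshold are satisfied with mmap() instead of brk().

 /* Demo: observe the program break before and after a malloc() call. */
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>

 int main(void)
 {
 void *before = sbrk(0); /* current program break */
 void *p = malloc(64 * 1024); /* likely grows the heap via brk() */
 void *after = sbrk(0);

 printf("break before malloc: %p\n", before);
 printf("malloc returned    : %p\n", p);
 printf("break after malloc : %p\n", after);

 free(p);
 return 0;
 }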

  One point that is easy to confuse: besides the allocations discussed here, memory is also allocated when fork() creates a new process or thread, and again while the process executes. Both kinds of allocation are performed by the kernel, but they serve different purposes. The allocations triggered by fork() mainly cover what every process needs: the kernel stack, the memory that holds the page tables, and the address space in which the code will run — in short, (1) the structures that manage the process's memory and (2) the process storage space, shown earlier, that holds its code and data while it executes. Memory obtained with malloc(), by contrast, is carved out of the heap inside an address space that has already been set up for the process and is used to store data. So both go through the kernel, yet their purposes differ considerably.

  Let us now walk through how dynamic memory is allocated from the heap of the process's address space.

    The C library function malloc() calls the Unix/Linux system call brk(); brk() traps into the kernel, where the kernel executes sys_brk(), and sys_brk() in turn calls do_brk() to perform the allocation:

malloc() ----------> brk() -----|-----> sys_brk() ----------> do_brk() ----------> vma_merge() / kmem_cache_zalloc()

        user space              |              kernel space

                     system call ---------->

    The sys_brk() function:

 asmlinkage unsigned long sys_brk(unsigned long brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;

down_write(&mm->mmap_sem);

if (brk < mm->end_code)
goto out;

/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
goto out;

newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk)
goto set_brk;

/* Always allow shrinking brk. */
if (brk <= mm->brk) {
if (!do_munmap(mm, newbrk, oldbrk-newbrk))
goto set_brk;
goto out;
}

/* Check against existing mmap mappings. */
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
goto out;

/* Ok, looks good - let it rip. */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
set_brk:
mm->brk = brk;
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
return retval;
}

    The do_brk() function:

    

 unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
unsigned long flags;
struct rb_node ** rb_link, * rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;

len = PAGE_ALIGN(len);
if (!len)
return addr;

if ((addr + len) > TASK_SIZE || (addr + len) < addr)
return -EINVAL;

if (is_hugepage_only_range(mm, addr, len))
return -EINVAL;

flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

error = arch_mmap_check(addr, len, flags);
if (error)
return error;

/*
* mlock MCL_FUTURE?
*/
if (mm->def_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
}

/*
* mm->mmap_sem is required to protect against another thread
* changing the mappings in case we sleep.
*/
verify_mm_writelocked(mm);

/*
* Clear old maps. this also does some error checking for us
*/
munmap_back:
vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}

/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;

if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;

if (security_vm_enough_memory(len >> PAGE_SHIFT))
return -ENOMEM;

/* Can we just expand an old private anonymous mapping? */
if (vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL))
goto out;

/*
* create a vma struct for an anonymous mapping
*/
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}

vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
vma->vm_page_prot = protection_map[flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
return addr;
}

    The vma_merge() function:

 struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;

/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
*/
if (vm_flags & VM_SPECIAL)
return NULL;

if (prev)
next = prev->vm_next;
else
next = mm->mmap;
area = next;
if (next && next->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;

/*
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma)) {
/* cases 1, 6 */
vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL);
} else /* cases 2, 5, 7 */
vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL);
return prev;
}

/*
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
else /* cases 3, 8 */
vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL);
return area;
}

return NULL;
}

    The kmem_cache_zalloc() function:

 void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
{
void *ret = kmem_cache_alloc(c, flags);
if (ret)
memset(ret, 0, c->size);
return ret;
}
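
For context, the allocation in do_brk() comes from the slab cache vm_area_cachep. The sketch below shows the general slab-cache usage pattern behind kmem_cache_zalloc(); it is a hedged illustration, not code from any particular release — in particular, the exact prototype of kmem_cache_create() has changed across kernel versions.

 /* Sketch of the slab-cache pattern used for objects such as vm_area_struct.
  * The kmem_cache_create() prototype varies between kernel versions. */
 struct foo {
 int a;
 struct list_head list;
 };

 static struct kmem_cache *foo_cachep;

 /* typically done once at initialization time, e.g.:
  * foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
  *                                SLAB_HWCACHE_ALIGN, NULL); */

 static struct foo *foo_alloc(void)
 {
 /* a zeroed object from the cache, just as do_brk() gets a zeroed vma */
 return kmem_cache_zalloc(foo_cachep, GFP_KERNEL);
 }

 static void foo_free(struct foo *f)
 {
 kmem_cache_free(foo_cachep, f);
 }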