接上篇 https://www.daodaodao123.com/?p=776
本篇解析缺页异常分支之一,匿名页面;
1.缺页中断触发条件
(1)pte页表项的PRESENT没有置位 (2)pte表项为空 (3)vma->vm_ops为空(内核用vma_is_anonymous()据此判定私有匿名页)
2.应用场景
(1)局部变量较大,函数调用较深进行栈扩展。 (2)malloc从堆空间分配内存,只是分配了虚拟内存空间,没有映射到物理页,第一次访问时发生。 (3)mmap分配匿名页,同样只是分配了虚拟内存空间,没有映射到物理页,第一次访问时发生。
3.linux内存映射的两个规律
规律1:
mmap等内存映射,会将私有的vma映射设置为只读; 参考mm/mmap.c
/*
 * Translate the VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bits of vm_flags
 * into hardware page-protection bits via the 16-entry protection_map
 * table, OR in any architecture-specific protection bits, then let the
 * architecture filter the final result.
 */
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
pgprot_val(arch_vm_get_page_prot(vm_flags)));
return arch_filter_pgprot(ret);
}
接
/*
 * Indexed by the low four VM_READ|VM_WRITE|VM_EXEC|VM_SHARED bits:
 * first 8 entries (__P*) are private mappings, last 8 (__S*) shared.
 */
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
接
/*
 * Note: every private (__P) combination that includes write maps to a
 * READ-ONLY protection — writes to private mappings must fault so the
 * kernel can perform copy-on-write. Shared (__S) write combinations
 * map to genuinely writable PAGE_SHARED* protections.
 */
#define __P000 PAGE_NONE
#define __P001 PAGE_READONLY
#define __P010 PAGE_READONLY
#define __P011 PAGE_READONLY
#define __P100 PAGE_EXECONLY
#define __P101 PAGE_READONLY_EXEC
#define __P110 PAGE_READONLY_EXEC
#define __P111 PAGE_READONLY_EXEC
#define __S000 PAGE_NONE
#define __S001 PAGE_READONLY
#define __S010 PAGE_SHARED
#define __S011 PAGE_SHARED
#define __S100 PAGE_EXECONLY
#define __S101 PAGE_READONLY_EXEC
#define __S110 PAGE_SHARED_EXEC
#define __S111 PAGE_SHARED_EXEC
规律2:
共享的匿名映射走shmem ,变成shmem的文件映射. 参见mm/mmap.c-->mmap_region()
/*
 * Excerpt of mm/mmap.c:mmap_region() — shows how the three mapping
 * kinds are dispatched (elided sections marked "...").
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
...
if (file) { ///file-backed mapping
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
addr = vma->vm_start;
...
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) { ///shared mapping
error = shmem_zero_setup(vma); ///shared anonymous mapping: attach shmem vma ops (same mechanism as SysV IPC shared memory)
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma); ///private anonymous mapping (vm_ops stays NULL)
}
...
}
4.私有匿名缺页
私有匿名缺页,处理流程图如下:
源码解析:
/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED) ///keep shared vmas out of the anonymous fault path; this function handles private anonymous mappings only
return VM_FAULT_SIGBUS;
/*
 * Use pte_alloc() instead of pte_alloc_map(). We can't run
 * pte_offset_map() on pmds where a huge pmd might be created
 * from a different thread.
 *
 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
 * parallel threads are excluded by other means.
 *
 * Here we only have mmap_read_lock(mm).
 */
if (pte_alloc(vma->vm_mm, vmf->pmd)) ///allocate a pte page table and install it in the pmd
return VM_FAULT_OOM;
/* See comment in handle_pte_fault() */
if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
///read fault: map the shared system zero page instead of allocating memory
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), ///my_zero_pfn returns the page-frame number of the global zero page
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, ///map the pte entry and take the page-table lock
vmf->address, &vmf->ptl);
if (!pte_none(*vmf->pte)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte; ///read-only zero-page case done; jump to install the PTE
}
///write fault (or zero page forbidden): allocate a private page
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma))) ///make sure an anon_vma exists, preparing for reverse mapping (rmap)
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, vmf->address); ///allocate a zeroed, movable anonymous page, preferring highmem (arm64 has no highmem)
if (!page)
goto oom;
if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
 * The memory barrier inside __SetPageUptodate makes sure that
 * preceding stores to the page contents become visible before
 * the set_pte_at() write.
 */
__SetPageUptodate(page); ///includes the memory barrier described above
entry = mk_pte(page, vma->vm_page_prot); ///build the pte value for the new page
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry)); ///mark the pte writable and dirty
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, ///map the pte entry and take the page-table spinlock
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
update_mmu_cache(vma, vmf->address, vmf->pte);
goto release;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); ///bump the process's anonymous-page counter
page_add_new_anon_rmap(page, vma, vmf->address, false); ///add the page to the reverse-map (rmap) system
lru_cache_add_inactive_or_unevictable(page, vma); ///put the page on the LRU lists
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); ///write the pte entry into the page table
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
put_page(page);
goto unlock;
oom_free_page:
put_page(page);
oom:
return VM_FAULT_OOM;
}
5.系统零页
补充一个说明,系统零页是一个只读的物理页面,内容全为零;在一些只读、且要求内容为0的场景,不单独分配物理页面,而是全部映射到系统零页,可以节省大量内存,并大幅提高性能。一个常见的配套机制就是写时复制(COW)。
应用程序使用malloc()分配虚拟内存后,三种情况: (1)直接读,linux内核进入缺页异常,调用do_anonymous_page函数使用零页映射,此时PTE属性只读; (2)先读后写,linux内核第一次触发缺页异常,映射零页;第二次触发异常,触发写时复制; (3)直接写,linux内核进入匿名页面的缺页异常,调用alloc_zeroed_user_highpage_movable分配一个新页面,这个PTE是可写的;