linux源码解析08–缺页异常之匿名页面

时间:2023-02-13 13:05:57

接上篇 https://www.daodaodao123.com/?p=776

本篇解析缺页异常分支之一,匿名页面

1.缺页中断触发条件

(1)pte页表项的PRESENT没有置位 (2)pte表项为空 (3)vma->vm_ops->fault为空(对于私有的匿名页)

2.应用场景

(1)局部变量较大,函数调用较深进行栈扩展。 (2)malloc从堆空间分配内存,只是分配了虚拟内存空间,没有映射到物理页,第一次访问时发生。 (3)mmap分配匿名页,同样只是分配了虚拟内存空间,没有映射到物理页,第一次访问时发生。

3.linux内存映射的两个规律

规律1:

mmap等内存映射,会将私有的vma映射设置为只读; 参考mm/mmap.c

/*
 * Translate a vma's vm_flags into the page protection bits used for its
 * PTEs: index the 16-entry protection_map table with the low four flag
 * bits (READ/WRITE/EXEC/SHARED), OR in any architecture-specific bits,
 * then let the architecture filter the final result.
 */
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
			pgprot_val(arch_vm_get_page_prot(vm_flags)));

	return arch_filter_pgprot(ret);
}

/*
 * Lookup table indexed by the low four vm_flags bits. The first eight
 * entries (__Pxxx) are private (copy-on-write) mappings, the last eight
 * (__Sxxx) are shared mappings.
 */
pgprot_t protection_map[16] __ro_after_init = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

/*
 * Private (copy-on-write) mappings: note that every combination with
 * VM_WRITE set (__P010/__P011/__P110/__P111) is still mapped read-only,
 * so the first write faults and triggers copy-on-write.
 */
#define __P000  PAGE_NONE
#define __P001  PAGE_READONLY
#define __P010  PAGE_READONLY
#define __P011  PAGE_READONLY
#define __P100  PAGE_EXECONLY
#define __P101  PAGE_READONLY_EXEC
#define __P110  PAGE_READONLY_EXEC
#define __P111  PAGE_READONLY_EXEC

/*
 * Shared mappings: writable combinations really are writable
 * (PAGE_SHARED / PAGE_SHARED_EXEC) — no copy-on-write.
 */
#define __S000  PAGE_NONE
#define __S001  PAGE_READONLY
#define __S010  PAGE_SHARED
#define __S011  PAGE_SHARED
#define __S100  PAGE_EXECONLY
#define __S101  PAGE_READONLY_EXEC
#define __S110  PAGE_SHARED_EXEC
#define __S111  PAGE_SHARED_EXEC

规律2

共享匿名映射走shmem,变成shmem的文件映射。参见 mm/mmap.c --> mmap_region()

/*
 * Abridged excerpt from mm/mmap.c ("..." marks code elided by the
 * article). Shows how the three mapping kinds are distinguished:
 * file-backed, shared anonymous (routed through shmem), and private
 * anonymous.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
		struct list_head *uf)
{
...
	if (file) {   ///file-backed mapping
		vma->vm_file = get_file(file);
		error = call_mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
		addr = vma->vm_start;
...
		vm_flags = vma->vm_flags;
	} else if (vm_flags & VM_SHARED) {   ///shared anonymous mapping
		error = shmem_zero_setup(vma);   ///wire up shmem vma ops (same mechanism as SysV IPC shared memory)
		if (error)
			goto free_vma;
	} else {
		vma_set_anonymous(vma);  ///private anonymous mapping
	}

...
}

4.私有匿名缺页

私有匿名缺页,处理流程图如下(注:原流程图图片缺失,此处仅余图片标题):

源码解析:

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	vm_fault_t ret = 0;
	pte_t entry;

	/* File mapping without ->vm_ops ? */
	if (vma->vm_flags & VM_SHARED)    ///shared vmas must not enter the anonymous fault path; this function handles private anonymous mappings only
		return VM_FAULT_SIGBUS;

	/*
	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
	 * pte_offset_map() on pmds where a huge pmd might be created
	 * from a different thread.
	 *
	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
	 * parallel threads are excluded by other means.
	 *
	 * Here we only have mmap_read_lock(mm).
	 */
	if (pte_alloc(vma->vm_mm, vmf->pmd)) ///allocate a PTE page table and install it in the pmd
		return VM_FAULT_OOM;

	/* See comment in handle_pte_fault() */
	if (unlikely(pmd_trans_unstable(vmf->pmd)))
		return 0;

///read-only fault: map the shared system zero page instead of allocating memory
	/* Use the zero-page for reads */
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), ///my_zero_pfn returns the zero page's page frame number
						vma->vm_page_prot));
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,  ///map the pte entry and take the page-table lock
				vmf->address, &vmf->ptl);
		if (!pte_none(*vmf->pte)) {
			update_mmu_tlb(vma, vmf->address, vmf->pte);
			goto unlock;
		}
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock;
		/* Deliver the page fault to userland, check inside PT lock */
		if (userfaultfd_missing(vma)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_MISSING);
		}
		goto setpte;  ///read-only zero-page case prepared; jump to install the PTE (original comment said "write case", which was wrong)
	}

///write fault: allocate our own private zeroed page
	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))  ///prepare the anon_vma for reverse mapping (rmap)
		goto oom;
	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);  ///allocate a movable zeroed anonymous page, preferring highmem (arm64 has no highmem)
	if (!page)
		goto oom;

	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
		goto oom_free_page;
	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page); ///contains the memory barrier described above

	entry = mk_pte(page, vma->vm_page_prot);  ///build the PTE value for the new page
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));  ///mark the entry dirty and writable

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,  ///map the pte entry and take the page-table spinlock
			&vmf->ptl);
	if (!pte_none(*vmf->pte)) {
		update_mmu_cache(vma, vmf->address, vmf->pte);
		goto release;
	}

	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;

	/* Deliver the page fault to userland, check inside PT lock */
	if (userfaultfd_missing(vma)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		put_page(page);
		return handle_userfault(vmf, VM_UFFD_MISSING);
	}

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);             ///bump the process's anonymous-page counter
	page_add_new_anon_rmap(page, vma, vmf->address, false);    ///add the page to the reverse-map (rmap) system
	lru_cache_add_inactive_or_unevictable(page, vma);          ///add the page to the LRU lists
setpte:
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);   ///install the entry into the page table

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
release:
	put_page(page);
	goto unlock;
oom_free_page:
	put_page(page);
oom:
	return VM_FAULT_OOM;
}

5.系统零页

补充一个说明,系统零页,一个只读的物理页面,内容是全零;在一些只读,且要求内容为0的场景,不分配物理页面,全部都映射到系统零页,可以节省大量内存,且大幅提高性能。常用的一个场景,写时复制;

应用程序使用malloc()分配虚拟内存后,三种情况: (1)直接读,linux内核进入缺页异常,调用do_anonymous_page函数使用零页映射,此时PTE属性只读; (2)先读后写,linux内核第一次触发缺页异常,映射零页;第二次触发异常,触发写时复制; (3)直接写,linux内核进入匿名页面的缺页异常,调用alloc_zeroed_user_highpage_movable分配一个新页面,这个PTE是可写的;