Linux内存都去哪了:(1)分析memblock在启动过程中对内存的影响

时间:2024-07-22 10:05:44

关键词:memblock、totalram_pages、meminfo、MemTotal、CMA等。

最近在做低成本方案,需要研究一整块RAM都用在哪里了?

最直观的就是通过/proc/meminfo查看MemTotal,实际上可能远小于RAM物理大小。

这里有个明显的分界点:在free_initmem()之后,MemTotal也即totalram_pages就固定在一个值。这也是Linux可以支配的内存,这之外的内存称之为内存黑洞。

这里分析从RAM启动,到free_initmem(),然后进入shell看到MemTotal究竟是多少?为什么这样?

这中间涉及到memblock、内核代码段、页面Reserved属性、CMA等等。

1. memblock介绍

memblock内存管理机制用于在Linux启动后管理内存,一直到free_initmem()为止。

之后totalram_pages就稳定在一个数值。

1.1 memblock数据结构

struct memblock是memblock的核心数据结构,下面分为几种类型的memblock,每种类型memblock包含若干regions。

/* Definition of memblock flags. */
enum {
MEMBLOCK_NONE = 0x0, /* No special request */
MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
MEMBLOCK_MIRROR = 0x2, /* mirrored region */
MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
}; struct memblock_region {
phys_addr_t base;--------------------------------region的基地址
phys_addr_t size;--------------------------------region的大小
unsigned long flags;-----------------------------region的标志,上面枚举体定义。
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
}; struct memblock_type {
unsigned long cnt; /* number of regions */---------------内存的regions数量。
unsigned long max; /* size of the allocated array */-----regions数组已分配的大小,即当前最多可容纳的region数量。
phys_addr_t total_size; /* size of all regions */--------regions总大小。
struct memblock_region *regions;----------------------------指向regions数组。
}; struct memblock {
bool bottom_up; /* is bottom up direction? */-----表示分配器的分配方式,true表示从低地址向高地址分配,false则相反。
phys_addr_t current_limit;-------------------------memblock分配内存时允许使用的物理地址上限。
struct memblock_type memory;-----------------------可用内存
struct memblock_type reserved;---------------------保留内存
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
struct memblock_type physmem;
#endif
};

内核中的memblock实例,定义了初始值,这个全局变量在后面会被频繁使用。

static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
.memory.max = INIT_MEMBLOCK_REGIONS, .reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
...
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};

1.2 memblock API介绍

memblock API主要有如下系列:

phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
int nid, ulong flags);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
bool memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
ulong choose_memblock_flags(void);

其中对不同类型memblock的分配释放主要有如下:其中memblock_add()和memblock_remove()是针对可用memblock操作;memblock_reserve()和memblock_free()是针对reserved类型memblock操作。

int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
return memblock_add_range(&memblock.memory, base, size, nid, 0);
} int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
return memblock_remove_range(&memblock.memory, base, size);
} int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
kmemleak_free_part_phys(base, size);
return memblock_remove_range(&memblock.reserved, base, size);
} int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}

1.3 对memblock调试

如果需要了解memblock的详细分配流程,可以通过在bootargs中加入“memblock=debug”。

在内核启动后,通过/proc/kmsg查看调试信息。

查看内存地址范围和reserved区域可以通过:

/sys/kernel/debug/memblock/memory

/sys/kernel/debug/memblock/reserved

2. totalram_pages是如何更新的

内核中totalram_pages在初始值是0,在free_initmem()之后就稳定在一个数值。

totalram_pages从0开始,(1)内核首先遍历memblock.memory和memblock.reserved区域,判断出空闲区域内存大小;

(2)然后在CMA阶段,将CMA预留的内存释放出来,totalram_pages增加;

(3)最后在free_initmem()中释放init段占用的内存。

2.1 memblock reserve区域分配

在打开memblock调试之后,可以看到创建reserve类型regions的log。

[    0.000000] memblock_reserve: [0x00000000000000-0x000000007a15ff] flags 0x0 setup_arch+0x66/0x258----------------------------内核代码段
[ 0.000000] memblock_reserve: [0x00000080000000-0x0000007fffffff] flags 0x0 setup_arch+0x7e/0x258----------------------------initrd
[ 0.000000] memblock_reserve: [0x00000000027380-0x0000000002bfb3] flags 0x0 early_init_dt_reserve_memory_arch+0x1e/0x30------dtb自身,从内核中__dtb_xxx_begin开始到__dtb_xxx_end结束的空间。
[ 0.000000] memblock_reserve: [0x00000004000000-0x00000004ffffff] flags 0x0 early_init_dt_reserve_memory_arch+0x1e/0x30------驱动保留区域
[ 0.000000] memblock_reserve: [0x00000001000000-0x00000003ffffff] flags 0x0 early_init_dt_reserve_memory_arch+0x1e/0x30------驱动保留区域
[ 0.000000] memblock_reserve: [0x00000005000000-0x00000007ffffff] flags 0x0 early_init_dt_reserve_memory_arch+0x1e/0x30------cma保留区域
[ 0.000000] memblock_reserve: [0x0000000fdff000-0x0000000fffefff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdfefe0-0x0000000fdfefff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdfa380-0x0000000fdfefb3] flags 0x0 memblock_alloc_range_nid+0x60/0x7c
[ 0.000000] memblock_reserve: [0x0000000fdea3a4-0x0000000fdfa37f] flags 0x0 memblock_alloc_range_nid+0x60/0x7c
[ 0.000000] memblock_reserve: [0x0000000fde9000-0x0000000fde9fff] flags 0x0 __alloc_memory_core_early+0xa4/0xe8
[ 0.000000] memblock_reserve: [0x0000000fde8000-0x0000000fde8fff] flags 0x0 __alloc_memory_core_early+0xa4/0xe8
[ 0.000000] memblock_reserve: [0x0000000fdea340-0x0000000fdea38a] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea2e0-0x0000000fdea32a] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea280-0x0000000fdea2ca] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fde7000-0x0000000fde7fff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fddf000-0x0000000fde6fff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdfefc0-0x0000000fdfefc3] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea3a0-0x0000000fdea3a3] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea260-0x0000000fdea263] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea240-0x0000000fdea243] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea1c0-0x0000000fdea237] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdea180-0x0000000fdea1bb] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdde000-0x0000000fddefff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdbe000-0x0000000fdddfff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200
[ 0.000000] memblock_reserve: [0x0000000fdae000-0x0000000fdbdfff] flags 0x0 memblock_virt_alloc_internal+0x1a6/0x200

其中内核代码段、initrd、dtb以及dtb中reserved-memory对应如下:

static void __init csky_memblock_init(void)
{
unsigned long zone_size[MAX_NR_ZONES];
unsigned long zhole_size[MAX_NR_ZONES];
signed long size; memblock_reserve(__pa(_stext), _end - _stext);--------------------------将内核代码段设置为reserved类型memblock,其中的init段会在free_initmem()中返还给内核。
#ifdef CONFIG_BLK_DEV_INITRD
memblock_reserve(__pa(initrd_start), initrd_end - initrd_start);--------将内核initrd段设置为reserved类型memblock
#endif
early_init_fdt_reserve_self();------------------------------------------将dtb本身区域设置为reserved类型memblock
early_init_fdt_scan_reserved_mem();-------------------------------------将dtb中reserved-memory区域设置为reserved类型memblock,其中CMA区域会返还给内核。
...
}

上面的reserved信息可以在/sys/kernel/debug/memblock/reserved中找到对应信息。

下面的信息,将一些错误的regions剔除,部分regions进行了合并。

   : 0x00000000..0x007a15ff
: 0x01000000..0x07ffffff
: 0x0fdae000..0x0fde9fff
: 0x0fdea180..0x0fdea1bb
: 0x0fdea1c0..0x0fdea237
: 0x0fdea240..0x0fdea243
: 0x0fdea260..0x0fdea263
: 0x0fdea280..0x0fdea2ca
: 0x0fdea2e0..0x0fdea32a
: 0x0fdea340..0x0fdea38a
: 0x0fdea3a0..0x0fdfefb3
: 0x0fdfefc0..0x0fdfefc3
: 0x0fdfefe0..0x0fffefff

2.2 返还空闲页面给buddy allocator

内核在free_all_bootmem()中将memblock中reserved之外的regions返还给内核的buddy allocator使用。

unsigned long __init free_all_bootmem(void)
{
unsigned long pages; reset_all_zones_managed_pages(); pages =free_low_memory_core_early();
printk("totalram_pages: %lu %luKB %s:%d\n", totalram_pages, totalram_pages<<2, __func__, __LINE__);
totalram_pages += pages;
printk("totalram_pages: %lu %luKB %s:%d\n", totalram_pages, totalram_pages<<2, __func__, __LINE__); return pages;
} static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
phys_addr_t start, end;
u64 i; memblock_clear_hotplug(0, -1); for_each_reserved_mem_region(i, &start, &end)----------------------------------------遍历memblock.reserved类型的regions。
reserve_bootmem_region(start, end);----------------------------------------------对每个regions设置页面属性为Reserved。 /*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
NULL)---------------------------------------------------------------------遍历所有在memblock.memory中,但是不在memblock.reserved中的regions。然后清Reserved页面属性。
count +=__free_memory_core(start, end); return count;
} void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
unsigned long start_pfn = PFN_DOWN(start);--------------------------------------------将start向下取整到页边界,得到起始页帧号。
unsigned long end_pfn = PFN_UP(end);--------------------------------------------------将end向上取整到页边界,得到结束页帧号,这样确保start到end的区域完全落在start_pfn和end_pfn之间。 printk("totalram_pages: reserved 0x%08x-0x%08x 0x%08x-0x%08x %lu-%lu=%lu", start, end, PFN_PHYS(start_pfn), PFN_PHYS(end_pfn), end_pfn, start_pfn, end_pfn-start_pfn);
for (; start_pfn < end_pfn; start_pfn++) {
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); INIT_LIST_HEAD(&page->lru); SetPageReserved(page);--------------------------------------------------------设置页面属性为Reserved。
}
}
} #define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid) static unsigned long __init __free_memory_core(phys_addr_t start,
phys_addr_t end)
{
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = min_t(unsigned long,
PFN_DOWN(end), max_low_pfn); if (start_pfn > end_pfn)
return 0; __free_pages_memory(start_pfn, end_pfn);
printk("totalram_pages: freeed 0x%08x-0x%08x 0x%08x-0x%08x %lu-%lu=%lu", start, end, PFN_PHYS(start_pfn), PFN_PHYS(end_pfn), end_pfn, start_pfn, end_pfn-start_pfn); return end_pfn - start_pfn;
} static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
int order; while (start < end) {
order = min(MAX_ORDER - 1UL, __ffs(start)); while (start + (1UL << order) > end)
order--; __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order);
}
} void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
return __free_pages_boot_core(page, order);
} static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop; prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);-------------------------------------------------对空闲页面,清Reserved页面属性。
set_page_count(p, 0); page_zone(page)->managed_pages += nr_pages;
set_page_refcounted(page);
__free_pages(page, order);
}

经过free_low_memory_core_early()之后,系统更新了totalram_pages。

[    0.000000] totalram_pages: reserved 0x00000000-0x007a15ff 0x00000000-0x007a2000 1954-0=1954-----------------------内核代码段
[ 0.000000] totalram_pages: reserved 0x01000000-0x07ffffff 0x01000000-0x08000000 32768-4096=28672------------------reserved-memory段
[ 0.000000] totalram_pages: reserved 0x0fdae000-0x0fde9fff 0x0fdae000-0x0fdea000 65002-64942=60
[ 0.000000] totalram_pages: reserved 0x0fdea180-0x0fdea1bb 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea1c0-0x0fdea237 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea240-0x0fdea243 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea260-0x0fdea263 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea280-0x0fdea2ca 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea2e0-0x0fdea32a 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea340-0x0fdea38a 0x0fdea000-0x0fdeb000 65003-65002=1
[ 0.000000] totalram_pages: reserved 0x0fdea3a0-0x0fdfefb3 0x0fdea000-0x0fdff000 65023-65002=21
[ 0.000000] totalram_pages: reserved 0x0fdfefc0-0x0fdfefc3 0x0fdfe000-0x0fdff000 65023-65022=1
[ 0.000000] totalram_pages: reserved 0x0fdfefe0-0x0fffefff 0x0fdfe000-0x0ffff000 65535-65022=513---------------------reserved页面数量为:1954+28672+60+21+513-1(21和513重合)=31219个页面。
[ 0.000000] totalram_pages: freeed 0x007a1600-0x01000000 0x007a2000-0x01000000 4096-1954=2142
[ 0.000000] totalram_pages: freeed 0x08000000-0x0fdae000 0x08000000-0x0fdae000 64942-32768=32174
[ 0.000000] totalram_pages: freeed 0x0fdea000-0x0fdea180 0x0fdea000-0x0fdea000 65002-65002=0--------------------------空闲一共34316个页面。
[ 0.000000] totalram_pages: 0 0KB free_all_bootmem:
[ 0.000000] totalram_pages: 34316 137264KB free_all_bootmem:

从上面log可知,reserve一共31219个页面,空闲一共34316个页面,合计65535个页面。

和/sys/kernel/debug/memblock/memory得到的内容一致。

: 0x00000000..0x0fffefff

2.3 CMA返还页面给buddy allocator

cma_init_reserved_areas()中初始化CMA区域,将这个区域和系统buddy allocator复用。

static int __init cma_init_reserved_areas(void)
{
int i; for (i = 0; i < cma_area_count; i++) {
int ret = cma_activate_area(&cma_areas[i]); if (ret)
return ret;
} return 0;
} static int __init cma_activate_area(struct cma *cma)
{
int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;----------------------------------------cma->count表示当前cma区域页面数,pageblock_order表示pageblock的阶数。这里将cma区域按pageblock划分。
struct zone *zone; cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!cma->bitmap)
return -ENOMEM;
printk("totalram_pages: cma active 0x%08x-0x%08x count=%lu\n", PFN_PHYS(cma->base_pfn),
PFN_PHYS(cma->base_pfn + cma->count),
cma->count);
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn)); do {
unsigned j; base_pfn = pfn;
for (j = pageblock_nr_pages; j; --j, pfn++) {----------------------------------对当前pageblock中每个页面进行有效性检查。因为MAX_ORDER为11,所以pageblock_order为10,pageblock_nr_pages为1024个页面。
WARN_ON_ONCE(!pfn_valid(pfn));
if (page_zone(pfn_to_page(pfn)) != zone)
goto err;
}
init_cma_reserved_pageblock(pfn_to_page(base_pfn));----------------------------遍历当前pageblock,进行属性设置。
} while (--i);---------------------------------------------------------------------按pageblock为单位遍历。 mutex_init(&cma->lock); #ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
spin_lock_init(&cma->mem_head_lock);
#endif return 0; err:
kfree(cma->bitmap);
cma->count = 0;
return -EINVAL;
}
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
struct page *p = page; do {
__ClearPageReserved(p);--------------------------------------------------------将pageblock中所有page清Reserved属性。
set_page_count(p, 0);
} while (++p, --i); set_pageblock_migratetype(page, MIGRATE_CMA);--------------------------------------设置当前pageblock属性为MIGRATE_CMA,只有可移动页面才可以在此pageblock申请内存。在CMA申请的时候,可以被移出,进而达到复用的目的。 if (pageblock_order >= MAX_ORDER) {
i = pageblock_nr_pages;
p = page;
do {
set_page_refcounted(p);
__free_pages(p, MAX_ORDER - 1);
p += MAX_ORDER_NR_PAGES;
} while (i -= MAX_ORDER_NR_PAGES);
} else {
set_page_refcounted(page);
__free_pages(page, pageblock_order);
} adjust_managed_page_count(page, pageblock_nr_pages);--------------------------------将复用的页面数返还给totalram_pages。
} void adjust_managed_page_count(struct page *page, long count)
{
spin_lock(&managed_page_count_lock);
page_zone(page)->managed_pages += count;
totalram_pages += count;
printk("totalram_pages: %lu %luKB %s:%d count=%ld\n", totalram_pages, totalram_pages<<2, __func__, __LINE__, count);
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages += count;
#endif
spin_unlock(&managed_page_count_lock);
}

从下面的log可以看出,cma以一个pageblock为单位返还,每个1024个页面;共12288个页面。

[    0.000000] totalram_pages: 0 0KB free_all_bootmem:
[ 0.000000] totalram_pages: 34316 137264KB free_all_bootmem:
[ 0.041129] totalram_pages: cma active 0x05000000-0x08000000 count=12288
[ 0.041316] totalram_pages: 35340 141360KB adjust_managed_page_count: count=1024
[ 0.041495] totalram_pages: 36364 145456KB adjust_managed_page_count: count=1024
[ 0.041672] totalram_pages: 37388 149552KB adjust_managed_page_count: count=1024
[ 0.041848] totalram_pages: 38412 153648KB adjust_managed_page_count: count=1024
[ 0.042027] totalram_pages: 39436 157744KB adjust_managed_page_count: count=1024
[ 0.042206] totalram_pages: 40460 161840KB adjust_managed_page_count: count=1024
[ 0.042384] totalram_pages: 41484 165936KB adjust_managed_page_count: count=1024
[ 0.042562] totalram_pages: 42508 170032KB adjust_managed_page_count: count=1024
[ 0.042740] totalram_pages: 43532 174128KB adjust_managed_page_count: count=1024
[ 0.042920] totalram_pages: 44556 178224KB adjust_managed_page_count: count=1024
[ 0.043096] totalram_pages: 45580 182320KB adjust_managed_page_count: count=1024
[ 0.043275] totalram_pages: 46604 186416KB adjust_managed_page_count: count=1024

2.4 init内存释放

在内核initcall完成之后,调用free_initmem()释放相关内存。

void free_initmem(void)
{
unsigned long addr; addr = (unsigned long) &__init_begin;
while (addr < (unsigned long) &__init_end) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
free_page(addr);
totalram_pages++;
printk("totalram_pages: %lu %luKB %s:%d\n", totalram_pages, totalram_pages<<2, __func__, __LINE__);
addr += PAGE_SIZE;
}
pr_info("Freeing unused kernel memory: %dk freed\n",
((unsigned int)&__init_end - (unsigned int)&__init_begin) >> 10);
}

2.5 totalram_pages小结

2.5.1 释放init内存

init占用的内存从__init_begin开始到__init_end结束。

下面是一个实例,各个符号的地址关系如下:_start/_stext/__init_begin < __sinittext < _einittext < __init_end/_text < _etext < _sdata/__start_rodata < __end_rodata < _edata < __bss_start < __bss_stop < _end。

可以看出整个内核可以分为三部分,其中init段在text段中。

Linux内存都去哪了:(1)分析memblock在启动过程中对内存的影响

从free_initmem()可知,释放的是__init_begin到__init_end这块内存,即上图红框部分。

void __init mem_init_print_info(const char *str)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size; physpages = get_num_physpages();
codesize = _etext - _stext;
datasize = _edata - _sdata;
rosize = __end_rodata - __start_rodata;
bss_size = __bss_stop - __bss_start;
init_data_size = __init_end - __init_begin;
init_code_size = _einittext - _sinittext; #define adj_init_size(start, end, size, pos, adj) \
do { \
if (start <= pos && pos < end && size > adj) \
size -= adj; \
} while (0) adj_init_size(__init_begin, __init_end, init_data_size,
_sinittext, init_code_size);---------------------------------------------如果_sinittext在__init_begin-__init_end范围内,且init_data_size大于init_code_size,从中扣除_sinittext开始的init_code_size长度的段。
adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);--------------从codesize中扣除init代码段。
adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);------------__init_begin地址不在范围内,所以不扣除。
adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);------------------__start_rodata不在_stext和_etext之间,所以不扣除。
adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);------------------从datasize中扣去rosize,剩下的就是可读写的大小。 #undef adj_init_size pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
"%s%s)\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,---------------------------------------------------这里的codesize意义不太大,因为扣除了init代码段,但是init数据段还在里面。
(init_data_size + init_code_size) >> 10, bss_size >> 10,----------------------------------------因为已经从init_data_size中扣除init_code_size,所以init_code_size+init_data_size也就是__init_begin到__init_end大小。这部分会被释放。
(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
totalhigh_pages << (PAGE_SHIFT - 10),
#endif
str ? ", " : "", str ? str : "");
}

每次释放一个页面,一共返还了3188KB,即797个页面。

[    0.000000] Memory: 137264K/262140K available (6304K kernel code, 225K rwdata, 892K rodata, 3188K init, 242K bss, 75724K reserved, 49152K cma-reserved, 0K highmem)

最终totalram_pages应该是:34316+12288+797=47401,也即189604KB大小。

然后在shell中查看/proc/meminfo,吻合。

MemTotal:         189604 kB
MemFree: kB
MemAvailable: kB
Buffers: kB
Cached: kB
...

所以MemTotal之外的内存黑洞主要包括以下内容:

  • 内核代码段,除了init段
  • dtb占用内存
  • dtb中reserved-memory段,除CMA部分
  • 系统共用部分(待分析)

2.6 遍历所有页面检查PG_reserved属性

通过遍历内核所有页面的属性,然后判断reserved和free区域。

free区域之和应该是totalram_pages,reserved是留给内核代码段和页面符号表等信息。

diff --git a/init/main.c b/init/main.c
index 8b52d9a..78e31e1
--- a/init/main.c
+++ b/init/main.c
@@ -, +, @@ static inline void mark_readonly(void)
}
#endif +void interate_pages_reserved(void)
+{
+ unsigned int i = 0, total_pages = node_present_pages(0), is_reserved = 0, region_start = 0, region_end = 0, total_reserved = 0, total_free = 0;
+ struct page *page;
+
+ printk("Reserved vs free of %u pages.\n pagenum phyaddr type\n", total_pages);
+ for(i = 0; i < total_pages; i++)
+ {
+ page = pfn_to_page(i);
+ if(i == 0)
+ {
+ is_reserved = test_bit(PG_reserved, &(page->flags));
+ }
+ else if(is_reserved != test_bit(PG_reserved, &(page->flags)))
+ {
+ region_end = i;
+ printk(" % 5u-% 5u 0x%08x-0x%08x %s\n", region_start, region_end-1, region_start<<12, (region_end<<12)-1, is_reserved?"reserved":"free");
+ if(is_reserved)
+ total_reserved += region_end - region_start;
+ else
+ total_free += region_end - region_start;
+ is_reserved = test_bit(PG_reserved, &(page->flags));
+ region_start = i;
+ }
+ }
+ region_end = i;
+ printk(" % 5u-% 5u 0x%08x-0x%08x %s\n", region_start, region_end-1, region_start<<12, (region_end<<12)-1, is_reserved?"reserved":"free");
+ printk("Summary: %u page, %u KB reserved; %u page, %u KB free.\n", total_reserved, total_reserved<<2, total_free, total_free<<2);
+}
static int __ref kernel_init(void *unused)
{
int ret;
@@ -, +, @@ static int __ref kernel_init(void *unused)
free_initmem();
#ifdef CONFIG_PERF_TIMER
printk(KERN_ALERT "PERF % 9u: kernel init done.\n", perf_timer_read_us());
+interate_pages_reserved();
#endif
mark_readonly();
system_state = SYSTEM_RUNNING;

interate_pages_reserved()输出如下:

[    0.652687] Reserved vs free of 65535 pages.
[ 0.652687] pagenum phyaddr type
[ 0.652718] 0-1197 0x00000000-0x004adfff free----------------------从dmesg中内核代码段init为4792KB,即1198个页面。
[ 0.652762] 1198-2353 0x004ae000-0x00931fff reserved------------------reserved1
[ 0.652826] 2354-4095 0x00932000-0x00ffffff free
[ 0.653227] 4096-16383 0x01000000-0x03ffffff reserved------------------reserved2
[ 0.654785] 16384-64941 0x04000000-0x0fdadfff free
[ 0.654811] 64942-65534 0x0fdae000-0x0fffefff reserved------------------reserved3
[ 0.654817] Summary: 13444 page, 53776 KB reserved; 51498 page, 205992 KB free.

dmesg显示内核代码段组成:

[    0.000000] Memory: 152048K/262140K available (7909K kernel code, 221K rwdata, 892K rodata, 4792K init, 242K bss, 60940K reserved, 49152K cma-reserved, 0K highmem)

分别查看meminfo、reserved、memory三者信息:

# cat /proc/meminfo
MemTotal: 205992 kB
MemFree: kB
MemAvailable: kB
... # cat /sys/kernel/debug/memblock/reserved
: 0x00000000..0x009315ff------------------------返还init部分给系统,剩下部分和reserved1吻合。
: 0x01000000..0x03ffffff------------------------和reserved2区域吻合
: 0x05000000..0x07ffffff------------------------CMA区域返还给系统使用
: 0x0fdae000..0x0fde9fff------------------------和reserved3区域吻合
: 0x0fdea360..0x0fdea39b
: 0x0fdea3a0..0x0fdea417
: 0x0fdea420..0x0fdea423
: 0x0fdea440..0x0fdea443
: 0x0fdea460..0x0fdea463
: 0x0fdea480..0x0fdea4bb
: 0x0fdea4c0..0x0fdea4fb
: 0x0fdea500..0x0fdea53b
: 0x0fdea540..0x0fdea543
: 0x0fdea54c..0x0fdfefd3
: 0x0fdfefe0..0x0fffefff
# cat /sys/kernel/debug/memblock/memory
: 0x00000000..0x0fffefff--------------------------总的RAM区域吻合。

3. 有什么用?

上面对totalram_pages的来历进行了分析,并说明了这之外都有哪些部分占用了多少内存。

从内存使用角度来看,肯定希望totalram_pages最大化。

那么就需要削减内核代码段、dtb大小等空间,驱动中尽量减少内存独占区域,提倡使用CMA这种复用技术。