Linux Memory Management (4): Allocating Physical Pages

Date: 2022-02-08 10:40:33

Series: Linux Memory Management

Keywords: allocation mask, buddy system, watermark, merging of free buddy blocks

We know that Linux manages memory at page granularity, and that physical memory is managed through the buddy system.

As the Linux memory-management framework diagram shows, the page allocator is the foundation on which all the other memory operations are built.

That is why, after covering "Linux Memory Management (1): Physical Memory Initialization", "Linux Memory Management (2): The Page Table Mapping Process", and "Linux Memory Management (3): Kernel Memory Layout", the next thing to understand is the page allocator.

1. Key Data Structures

1.1 The Page Allocation Mask

alloc_pages is the kernel's usual interface for allocating physical pages. It takes two parameters, one of which is the allocation mask.

include/linux/gfp.h holds the GFP (Get Free Page) allocation masks, which fall into two groups: masks prefixed with __GFP_, and masks prefixed with GFP_, which are generally combinations of the former.

The __GFP_ masks split into two major classes: zone modifiers and action modifiers.

Zone modifiers occupy the low 4 bits of the mask and specify which zone to allocate pages from (see the decoding sketch after the listing below).

Action modifiers define the properties of the allocation.

/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA 0x01u
#define ___GFP_HIGHMEM 0x02u
#define ___GFP_DMA32 0x04u
#define ___GFP_MOVABLE 0x08u
#define ___GFP_WAIT 0x10u
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_COLD 0x100u
#define ___GFP_NOWARN 0x200u
#define ___GFP_REPEAT 0x400u
#define ___GFP_NOFAIL 0x800u
#define ___GFP_NORETRY 0x1000u
#define ___GFP_MEMALLOC 0x2000u
#define ___GFP_COMP 0x4000u
#define ___GFP_ZERO 0x8000u
#define ___GFP_NOMEMALLOC 0x10000u
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
#define ___GFP_RECLAIMABLE 0x80000u
#define ___GFP_NOTRACK 0x200000u
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
#define ___GFP_WRITE 0x1000000u
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
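
To make the zone-modifier encoding concrete, here is a small standalone userspace sketch that decodes the low 4 bits of a mask. The constants are copied from the listing above; the decoder is a simplification for illustration, not the kernel's gfp_zone(), which maps these bits to a zone_type through GFP_ZONE_TABLE.

/* Standalone userspace demo: decode the zone modifier bits of a gfp mask. */
#include <stdio.h>

#define ___GFP_DMA     0x01u
#define ___GFP_HIGHMEM 0x02u
#define ___GFP_DMA32   0x04u
#define ___GFP_MOVABLE 0x08u
#define GFP_ZONEMASK   0x0fu	/* the low 4 bits are the zone modifiers */

static void decode_zone_bits(unsigned int gfp_mask)
{
	unsigned int zone_bits = gfp_mask & GFP_ZONEMASK;

	printf("mask 0x%x: zone bits 0x%x%s%s%s%s\n", gfp_mask, zone_bits,
	       (zone_bits & ___GFP_DMA)     ? " DMA"     : "",
	       (zone_bits & ___GFP_DMA32)   ? " DMA32"   : "",
	       (zone_bits & ___GFP_HIGHMEM) ? " HIGHMEM" : "",
	       (zone_bits & ___GFP_MOVABLE) ? " MOVABLE" : "");
}

int main(void)
{
	decode_zone_bits(0x01u);	 /* __GFP_DMA */
	decode_zone_bits(0x02u | 0x08u); /* highmem + movable, typical for user pages */
	decode_zone_bits(0xd0u);	 /* GFP_KERNEL: no zone bits set, so ZONE_NORMAL */
	return 0;
}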

In practice, the GFP_-prefixed masks are what you normally use:

/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC	(__GFP_HIGH)
#define GFP_NOIO	(__GFP_WAIT)
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
			 __GFP_RECLAIMABLE)
#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE)
#define GFP_IOFS	(__GFP_IO | __GFP_FS)
#define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
			 __GFP_NO_KSWAPD)

/*
 * GFP_THISNODE does not perform any reclaim, you most likely want to
 * use __GFP_THISNODE to allocate from a given node without fallback!
 */
#ifdef CONFIG_NUMA
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
#else
#define GFP_THISNODE	((__force gfp_t)0)
#endif

/* This mask makes up all the page movable related flags */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)

/* Control page allocator reclaim behavior */
#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
			__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)

/* Control slab gfp mask during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))

/* Control allocation constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
   platforms, used as appropriate on others */
#define GFP_DMA		__GFP_DMA

/* 4GB DMA on some platforms */
#define GFP_DMA32	__GFP_DMA32
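
As a quick sanity check on how these combinations compose, the standalone userspace snippet below rebuilds a few of them from the raw bits and prints the resulting values; GFP_KERNEL, for instance, works out to 0xd0.

/* Standalone userspace demo: the GFP_ masks are just ORed __GFP_ bits. */
#include <stdio.h>

#define __GFP_WAIT 0x10u
#define __GFP_HIGH 0x20u
#define __GFP_IO   0x40u
#define __GFP_FS   0x80u

#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO   (__GFP_WAIT)
#define GFP_NOFS   (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

int main(void)
{
	printf("GFP_ATOMIC = 0x%02x\n", GFP_ATOMIC); /* 0x20: no __GFP_WAIT, must not sleep */
	printf("GFP_NOIO   = 0x%02x\n", GFP_NOIO);   /* 0x10: may sleep, no block I/O */
	printf("GFP_NOFS   = 0x%02x\n", GFP_NOFS);   /* 0x50: may sleep and do I/O, no FS calls */
	printf("GFP_KERNEL = 0x%02x\n", GFP_KERNEL); /* 0xd0: may sleep, I/O and FS allowed */
	return 0;
}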

2. Allocating Memory from the Buddy System

alloc_page------------------------------allocates a single page
get_zeroed_page-->__get_free_pages
  alloc_pages----------------------------allocates 2^order pages
    alloc_pages_node---------------------adds a node id parameter
      __alloc_pages
        __alloc_pages_nodemask-----------adds a nodemask parameter (wrapper definitions sketched below)
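
For reference, in this kernel generation the wrappers reduce roughly to the following (the alloc_pages shown is the !CONFIG_NUMA form; under CONFIG_NUMA it routes through the mempolicy code instead):

#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/* Unknown node is current node */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}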

The kernel's own comment says it plainly: "__alloc_pages_nodemask is the 'heart' of the zoned buddy allocator."

Two things follow from that sentence: __alloc_pages_nodemask is the core entry point, and the buddy page allocator works zone by zone.

struct alloc_context is the structure the buddy allocator functions use to carry their parameters around.

struct alloc_context {
	struct zonelist *zonelist;	/* candidate zones, in fallback order */
	nodemask_t *nodemask;		/* allowed NUMA nodes */
	struct zone *preferred_zone;	/* first suitable zone, also used for statistics */
	int classzone_idx;		/* zone index of the preferred zone */
	int migratetype;		/* migratetype derived from gfp_mask */
	enum zone_type high_zoneidx;	/* highest zone_type this gfp_mask allows */
};

The zonelist here has already been obtained via node_zonelist(nid, gfp_mask), i.e. zonelist = NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags).
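
For reference, node_zonelist and its helper look roughly like this in the same kernel generation:

static inline int gfp_zonelist(gfp_t flags)
{
	if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
		return 1;	/* the node-local-only zonelist */
	return 0;		/* the default zonelist, with fallback nodes */
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}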

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct zoneref *preferred_zoneref;
	struct page *page = NULL;
	unsigned int cpuset_mems_cookie;
	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = {
		.high_zoneidx = gfp_zone(gfp_mask),--------------------------gfp_zone maps the low 4 bits of gfp_mask to the zone_type. ZONE_NORMAL? ZONE_HIGHMEM?
		.nodemask = nodemask,
		.migratetype = gfpflags_to_migratetype(gfp_mask),------------derive the page migratetype from gfp_mask. MIGRATE_RECLAIMABLE? MIGRATE_MOVABLE?
	};

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();

	/* We set it here, as __alloc_pages_slowpath might have changed it */
	ac.zonelist = zonelist;
	/* The preferred zone is used for statistics later */
	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
				ac.nodemask ? : &cpuset_current_mems_allowed,
				&ac.preferred_zone);
	if (!ac.preferred_zone)
		goto out;
	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

	/* First allocation attempt */
	alloc_mask = gfp_mask|__GFP_HARDWALL;
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);----fast path: try to grab the pages directly
	if (unlikely(!page)) {
		/*
		 * Runtime PM, block IO and its error handling path
		 * can deadlock because I/O on the device might not
		 * complete.
		 */
		alloc_mask = memalloc_noio_flags(gfp_mask);

		page = __alloc_pages_slowpath(alloc_mask, order, &ac);---------on failure, the slowpath handles the many special cases here.
	}

	if (kmemcheck_enabled && page)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
	/*
	 * When updating a task's mems_allowed, it is possible to race with
	 * parallel threads in such a way that an allocation can fail while
	 * the mask is being updated. If a page allocation is about to fail,
	 * check if the cpuset changed during allocation and if so, retry.
	 */
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;---------------------------------------------retry the page allocation

	return page;
}

get_page_from_freelist walks the zones in ac->zonelist looking for one that satisfies all the constraints, allocates the page there, and returns it.

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zonelist *zonelist = ac->zonelist;
	struct zoneref *z;
	struct page *page = NULL;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
				(gfp_mask & __GFP_WRITE);
	int nr_fair_skipped = 0;
	bool zonelist_rescan;

zonelist_scan:-----------------------------------------------------------------start scanning ac->zonelist.
	zonelist_rescan = false;

	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,----walk the zones in zonelist no higher than ac->high_zoneidx; each iteration yields one zone.
								ac->nodemask) {
		...-------------------------------------------------------------a series of checks; any failure moves on to the next zone, otherwise fall through to the watermark check.
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];---------alloc_flags contains ALLOC_WMARK_LOW here,
		if (!zone_watermark_ok(zone, order, mark,-----------------------so this checks the zone's low watermark; if it fails, run the checks below or try zone_reclaim.
				       ac->classzone_idx, alloc_flags)) {
			int ret;

			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;
			...
			ret = zone_reclaim(zone, gfp_mask, order);--------------reclaim some pages through zone_reclaim
			switch (ret) {
			...
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac->classzone_idx, alloc_flags))---recheck whether the watermark is now satisfied
					goto try_this_zone;

				/*
				 * Failed to reclaim enough to meet watermark.
				 * Only mark the zone full if checking the min
				 * watermark or if we failed to reclaim just
				 * 1<<order pages or else the page allocator
				 * fastpath will prematurely mark zones full
				 * when the watermark is between the low and
				 * min watermarks.
				 */
				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
				    ret == ZONE_RECLAIM_SOME)
					goto this_zone_full;

				continue;
			}
		}

try_this_zone:------------------------------------------------------------once the watermark and all other conditions are satisfied, pages can be allocated from this zone.
		page = buffered_rmqueue(ac->preferred_zone, zone, order,--------take pages from this zone
						gfp_mask, ac->migratetype);
		if (page) {
			if (prep_new_page(page, order, gfp_mask, alloc_flags))
				goto try_this_zone;
			return page;
		}
this_zone_full:
		if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
			zlc_mark_zone_full(zonelist, z);
	}

	/*
	 * The first pass makes sure allocations are spread fairly within the
	 * local node. However, the local node might have free pages left
	 * after the fairness batches are exhausted, and remote zones haven't
	 * even been considered yet. Try once more without fairness, and
	 * include remote zones now, before entering the slowpath and waking
	 * kswapd: prefer spilling to a remote zone over swapping locally.
	 */
	if (alloc_flags & ALLOC_FAIR) {
		alloc_flags &= ~ALLOC_FAIR;
		if (nr_fair_skipped) {
			zonelist_rescan = true;
			reset_alloc_batches(ac->preferred_zone);
		}
		if (nr_online_nodes > 1)
			zonelist_rescan = true;
	}

	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		zonelist_rescan = true;
	}

	if (zonelist_rescan)
		goto zonelist_scan;

	return NULL;
}
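
The scan at the top relies on for_each_zone_zonelist_nodemask; in this kernel generation the macro expands to roughly:

#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask, &zone))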

How the watermark values themselves are calculated is covered in detail in the watermark post.
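
The index alloc_flags & ALLOC_WMARK_MASK used above selects one of the three per-zone watermarks, defined in include/linux/mmzone.h:

enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])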

Now look at __zone_watermark_ok, which decides whether a zone's free pages satisfy the watermark selected by alloc_flags.

z is the zone being checked, order the order of the request, mark the watermark value, classzone_idx the zone index, alloc_flags the allocation flags, and free_pages the current number of free pages.

static bool __zone_watermark_ok(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx, int alloc_flags,
			long free_pages)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	int o;
	long free_cma = 0;

	free_pages -= (1 << order) - 1;-----------------------------------------deduct the request; with the strict <= checks below, the -1 makes "pass" mean at least min pages remain after all 2^order pages are taken.
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;
	...
	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])----free pages must stay above min plus the zone's lowmem_reserve
		return false;
	for (o = 0; o < order; o++) {-------------------------------------------walk every buddy order below the requested one, rechecking the watermark at each step
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;---------------------strike this order's free pages off the total

		/* Require fewer higher order pages to be free */
		min >>= 1;------------------------------------------------------halve the watermark,

		if (free_pages <= min)------------------------------------------then recheck against it
			return false;
	}
	return true;------------------------------------------------------------all checks passed, return true
}

The purpose of the loop can be summed up as follows:
Iterate order by order, checking whether enough large (higher-order) free blocks remain.
Each pass first strikes the current order's free pages off the total, because only larger blocks can still serve the request.
And having struck part of the free pages off the total, the bar must be relaxed correspondingly.
How much it is relaxed is exactly the right shift applied to min above; a standalone simulation follows.
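
Below is a standalone userspace simulation of that loop with invented numbers (watermark_ok, the free-block counts, and the mark value are all hypothetical, and lowmem_reserve/CMA handling is dropped). It shows an order-2 request failing even though the total free page count is far above the mark, because too few high-order blocks remain.

#include <stdio.h>
#include <stdbool.h>

static bool watermark_ok(unsigned int order, long mark,
			 const long *nr_free,	/* free blocks at each order */
			 long free_pages)
{
	long min = mark;
	unsigned int o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min)		/* simplified: no lowmem_reserve, no CMA */
		return false;

	for (o = 0; o < order; o++) {
		/* blocks of order o cannot serve a larger request */
		free_pages -= nr_free[o] << o;
		min >>= 1;		/* relax the bar as pages are struck off */
		printf("o=%u: free_pages=%ld min=%ld\n", o, free_pages, min);
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* hypothetical zone: 300 order-0 blocks, 40 order-1, 8 order-2 */
	const long nr_free[3] = { 300, 40, 8 };
	long free_pages = 300 * 1 + 40 * 2 + 8 * 4;	/* 412 pages free in total */

	printf("order-2 request %s\n",
	       watermark_ok(2, 128, nr_free, free_pages) ? "passes" : "fails");
	return 0;
}

With these numbers the loop fails at o=1 (29 <= 32): 412 pages are free overall, comfortably above the mark of 128, yet almost all of them sit in order-0 and order-1 blocks.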

See also: an analysis of __zone_watermark_ok.

zone_reclaim:

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	int node_id;
	int ret;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
		return ZONE_RECLAIM_FULL;

	if (!zone_reclaimable(zone))
		return ZONE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
		return ZONE_RECLAIM_NOSCAN;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone_to_nid(zone);
	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
		return ZONE_RECLAIM_NOSCAN;

	if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
		return ZONE_RECLAIM_NOSCAN;

	ret = __zone_reclaim(zone, gfp_mask, order);
	clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
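
For reference, the zone_reclaim return codes above are defined in mm/internal.h in this kernel generation as roughly the following (worth double-checking against your tree):

#define ZONE_RECLAIM_NOSCAN	-2	/* did not scan at all */
#define ZONE_RECLAIM_FULL	-1	/* nothing left to reclaim in this zone */
#define ZONE_RECLAIM_SOME	0	/* scanned but reclaimed too little */
#define ZONE_RECLAIM_SUCCESS	1	/* reclaimed enough */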

buffered_rmqueue:

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, int migratetype)
{
	unsigned long flags;
	struct page *page;
	bool cold = ((gfp_flags & __GFP_COLD) != 0);

	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);

		list_del(&page->lru);
		pcp->count--;
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order, migratetype);
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_freepage_state(zone, -(1 << order),
					  get_freepage_migratetype(page));
	}

	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);

	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone, gfp_flags);
	local_irq_restore(flags);

	VM_BUG_ON_PAGE(bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
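
The hot/cold choice above just takes opposite ends of the per-CPU list: a hot allocation takes the head (list->next, most recently freed and likely cache-warm), a cold one takes the tail (list->prev). Below is a standalone userspace sketch with a minimal circular list standing in for pcp->lists[migratetype]:

#include <stdio.h>
#include <stdbool.h>

struct node { struct node *next, *prev; int pfn; };

static void list_add_head(struct node *head, struct node *n)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static struct node *take_page(struct node *head, bool cold)
{
	struct node *n = cold ? head->prev : head->next;	/* tail : head */

	n->prev->next = n->next;
	n->next->prev = n->prev;
	return n;
}

int main(void)
{
	struct node head = { &head, &head, -1 };
	struct node pages[3] = { {0, 0, 100}, {0, 0, 101}, {0, 0, 102} };
	int i;

	/* freed pages land at the head, so head->next is the most recently freed */
	for (i = 0; i < 3; i++)
		list_add_head(&head, &pages[i]);

	printf("hot  alloc -> pfn %d\n", take_page(&head, false)->pfn);	/* 102 */
	printf("cold alloc -> pfn %d\n", take_page(&head, true)->pfn);	/* 100 */
	return 0;
}

In the kernel, free_hot_cold_page makes the matching distinction on the free side, adding hot pages at the head and cold pages at the tail; the sketch only models the allocation side.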

3. Freeing Pages

__free_page
free_page-->free_pages
  __free_pages
    free_hot_cold_page-------------------order-0 pages go back to the per-CPU page lists
    __free_pages_ok----------------------higher-order pages go straight back to the buddy system, where freed blocks are merged with their free buddies

__free_pages is the counterpart of alloc_pages: it dispatches order-0 pages to free_hot_cold_page and everything else to __free_pages_ok. A usage sketch pairing allocation and free follows.
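
A hedged usage sketch of the two paths together (example_alloc_free is a hypothetical wrapper; alloc_pages, page_address, and __free_pages are the real kernel APIs of this era):

#include <linux/gfp.h>
#include <linux/mm.h>

static int example_alloc_free(void)
{
	struct page *page;

	page = alloc_pages(GFP_KERNEL, 2);	/* 2^2 = 4 contiguous pages */
	if (!page)
		return -ENOMEM;

	/* ... use page_address(page) to get the kernel virtual address ... */

	__free_pages(page, 2);			/* the order must match the allocation */
	return 0;
}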

4. Buddy-System-Related procfs Nodes

4.1 /proc/pagetypeinfo

/proc/pagetypeinfo prints the pageblock order and the number of pages per pageblock; then, for each node and zone (Normal and HighMem on this system), the free page count per migrate type (Unmovable, Reclaimable, Movable, Reserve, CMA, Isolate) at every order; and finally a per-zone table of how many pageblocks belong to each migrate type.