作者簡介:偉林,中年碼農,從事過電信、手機、安全、芯片等行業,目前依舊從事 Linux 方向開發工作,個人愛好 Linux 相關知識分享。
內存分配
-
gfp_mask
node 候選策略
zone 候選策略
zone fallback 策略
lowmem reserve 機制
order fallback 策略
migrate type 候選策略
migrate fallback 策略
reclaim watermark
reclaim 方式
alloc_pages()
內存釋放
Buddy 系統中,相比較內存的分配,內存的釋放過程更簡單,我們先來解析這部分。
這里體現了 Buddy 的核心思想:在內存釋放時,判斷其 buddy(伙伴)page 是不是 order 大小相等的 free page,如果是則合并成更高一階的 order。這樣做的目的是盡最大可能減少內存碎片化。
內存釋放最后都會落到 __free_pages() 函數:
/*
 * Drop one reference on @page; when that was the last reference, release
 * the 2^order block that @page heads.
 */
void __free_pages(struct page *page, unsigned int order)
{
	/*
	 * (1) put_page_testzero() decrements page->_refcount and reports
	 * whether it reached zero. A non-zero count means the page is still
	 * in use, so there is nothing to free yet.
	 */
	if (!put_page_testzero(page))
		return;

	free_the_page(page, order);
}
↓
/*
 * Route a block being freed to the right back end depending on its order.
 */
static inline void free_the_page(struct page *page, unsigned int order)
{
	/* (2) order > 0 (more than one page): free into the order's free_area. */
	if (order != 0) {
		__free_pages_ok(page, order);
		return;
	}

	/* (1) A single page is first handed to the per-cpu page (pcp) lists. */
	free_unref_page(page);	/* Via pcp? */
}
↓
/*
 * Free a 2^order block directly to its zone's free lists. This is the path
 * free_the_page() takes for order > 0.
 */
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
/* (2.1) Pre-free work on the page: clear some fields, run sanity checks
and invoke callbacks. Nothing is freed if the preparation fails.
*/
if (!free_pages_prepare(page, order, true))
return;
/* (2.2) Look up the migrate type of the pageblock that contains the page.
The page will be released onto the free list of that migrate type in the
free_area of the matching order.
*/
migratetype = get_pfnblock_migratetype(page, pfn);
/* The event accounting and the actual free run with local IRQs saved/disabled. */
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
/* (2.3) Release the page into its zone. */
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
↓
free_one_page()
↓
/*
 * Core of the buddy free path: insert the 2^order block starting at
 * @page/@pfn into @zone, merging it with its free buddy of the same order
 * repeatedly to build the largest possible free block, then link the
 * result into free_area[order].free_list[migratetype].
 */
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long combined_pfn;
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
/* First pass merges only up to pageblock_order; see the max_order++ retry below. */
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
/* The block must be naturally aligned to its order and lie inside the zone. */
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
/* (2.3.1) Try to merge the freed 2^order block upwards, one order at a time. */
while (order < max_order - 1) {
/* (2.3.1.1) Locate the buddy of the current 2^order block.
Per the article, the buddy pfn is obtained by XOR-ing pfn with (1 << order):
the block whose bit "order" is 0 pairs with the one whose bit is 1, and
vice versa.
*/
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
if (!pfn_valid_within(buddy_pfn))
goto done_merging;
/* (2.3.1.2) Check that the buddy really is a mergeable buddy. Per the article:
it is free and owned by the buddy system
(page->_mapcount == PAGE_BUDDY_MAPCOUNT_VALUE), and
its recorded free order equals the order being freed (page->private == order).
*/
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) {
clear_page_guard(zone, buddy, order, migratetype);
} else {
/* (2.3.1.3) Merge conditions hold: unlink the buddy from the free list
it currently sits on.
*/
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
/* Clear the order information stored in the buddy page. Per the article:
page->_mapcount = -1
page->private = 0
*/
rmv_page_order(buddy);
}
/* (2.3.1.4) The pair now forms one free block of the next higher order;
its first pfn is the lower of the two (buddy_pfn & pfn).
*/
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
/* Allow one more order and re-enter the merge loop. */
max_order++;
goto continue_merging;
}
/* (2.3.2) Link the merged free block of the final order into the free lists. */
done_merging:
/* (2.3.2.1) Record the order in the page. Per the article:
page->_mapcount = PAGE_BUDDY_MAPCOUNT_VALUE (-128)
page->private = order
*/
set_page_order(page, order);
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
/* (2.3.2.2) Likely-to-merge case: add the free page at the tail of the order's list. */
if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
goto out;
}
}
/* (2.3.2.3) Default case: add the free page at the head of the order's list. */
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
zone->free_area[order].nr_free++;
}
PageBuddy() 用來判斷 page 是否在 buddy 系統中,還有很多類似的 page 操作函數都定義在 page-flags.h 當中:
linux-source-4.15.0/include/linux/page-flags.h:
#define PAGE_MAPCOUNT_OPS(uname, lname)
static __always_inline int Page##uname(struct page *page)
{
return atomic_read(&page->_mapcount) ==
PAGE_##lname##_MAPCOUNT_VALUE;
}
static __always_inline void __SetPage##uname(struct page *page)
{
VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE);
}
static __always_inline void __ClearPage##uname(struct page *page)
{
VM_BUG_ON_PAGE(!Page##uname(page), page);
atomic_set(&page->_mapcount, -1);
}
/*
* PageBuddy() indicate that the page is free and in the buddy system
* (see mm/page_alloc.c).
*/
#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
PAGE_MAPCOUNT_OPS(Buddy, BUDDY)
對于單個 page,會首先釋放到 percpu 緩存(pcp)中:
start_kernel() → mm_init() → mem_init() → free_all_bootmem() → free_low_memory_core_early() → __free_memory_core() → __free_pages_memory() → __free_pages_bootmem() → __free_pages() → free_the_page() → free_unref_page():
↓
/*
 * Free a single (order-0) page to the per-cpu page (pcp) lists.
 */
void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
/* (1) Preparation work. Per the article this records the migrate type in
the page (page->index = migratetype). Nothing is freed if it fails.
*/
if (!free_unref_page_prepare(page, pfn))
return;
local_irq_save(flags);
/* (2) Release the page into the pcp; done with local IRQs saved/disabled. */
free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}
↓
/*
 * Put one order-0 page on the current CPU's pcp list for its zone, and
 * drain a batch back to the zone free lists when the pcp grows too large.
 */
static void free_unref_page_commit(struct page *page, unsigned long pfn)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
int migratetype;
/* (2.1) Fetch the migrate type cached in the page (per the article: page->index). */
migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
/* (2.2) Migrate types >= MIGRATE_PCPTYPES have no pcp list of their own. */
if (migratetype >= MIGRATE_PCPTYPES) {
/* (2.2.1) Isolated pages bypass the pcp and go straight to the zone free lists. */
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, 0, migratetype);
return;
}
/* Any other non-pcp type is queued on the MIGRATE_MOVABLE pcp list. */
migratetype = MIGRATE_MOVABLE;
}
/* (2.3) Get this CPU's pcp structure for the zone. */
pcp = &this_cpu_ptr(zone->pageset)->pcp;
/* (2.4) Add the free page to the head of the pcp list for its migrate type. */
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
/* (2.5) Once the pcp holds at least pcp->high pages, release pcp->batch
pages back to the zone free lists.
*/
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
}
pcp->high 和 pcp->batch 的賦值過程:
start_kernel() → setup_per_cpu_pageset() → setup_zone_pageset() → zone_pageset_init() → pageset_set_high_and_batch():
|→
/*
 * Compute the pcp batch size for @zone from its number of managed pages.
 *
 * Returns a value of the form 2^n - 1 (at least 0), derived from roughly
 * (managed_pages / (1024 * 4)) * 3/2 with the intermediate value capped at
 * 512 KiB worth of pages.
 */
static int zone_batchsize(struct zone *zone)
{
	int batch;	/* working value; was used undeclared in the excerpt */

	/* batch size = (zone_size / (1024*4)) * (3/2) */
	batch = zone->managed_pages / 1024;
	/* Cap at 512 KiB worth of pages before the division by 4 below. */
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;		/* We effectively *= 4 below */
	if (batch < 1)
		batch = 1;

	/* Scale by 3/2, round down to a power of two, then subtract one. */
	batch = rounddown_pow_of_two(batch + batch/2) - 1;

	return batch;
}
|→
/*
 * Configure the pcp of pageset @p from a batch size: the high mark is set
 * to six times @batch and the batch value is clamped to at least 1.
 */
static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
{
	/* high = 6 * batch */
	unsigned long high_mark = 6 * batch;
	unsigned long batch_size = max(1UL, 1 * batch);

	pageset_update(&p->pcp, high_mark, batch_size);
}
內存分配
相比較釋放,內存分配的策略要復雜得多,要考慮的因素也多很多,讓我們一一來解析。
gfp_mask
gfp_mask 是 GFP(Get Free Page)相關的一系列標志,控制了分配 page 的一系列行為。
node 候選策略
在 NUMA 的情況下,會有多個 memory node 可供選擇,系統會根據 policy 選擇當前分配的 node。
alloc_pages() → alloc_pages_current():
/*
 * Allocate 2^order pages following the NUMA memory policy that applies to
 * the current context. Returns whatever the policy-specific allocator
 * call below returns.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
/* (1.1) Start from the default NUMA policy. */
struct mempolicy *pol = &default_policy;
struct page *page;
/* (1.2) Outside interrupt context and without __GFP_THISNODE, use the
current task's NUMA policy instead.
*/
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
/* (2) Allocate from the preferred node and the candidate node mask
that the NUMA policy selects.
*/
page = __alloc_pages_nodemask(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
zone 候選策略
Buddy 系統中對每一個 node 定義了多個類型的 zone :
/*
 * Zone types a node can be divided into, ordered from lowest to highest
 * enumerator value.
 * NOTE(review): the listing appears to omit the kernel's per-zone config
 * guards — confirm against the real header before relying on exact values.
 */
enum zone_type {
ZONE_DMA,
ZONE_DMA32,
ZONE_NORMAL,
ZONE_HIGHMEM,
ZONE_MOVABLE,
ZONE_DEVICE,
__MAX_NR_ZONES /* not a zone: one past the last zone type */
};
gfp_mask 中也定義了一系列選擇 zone 的flag:
/*
* Physical address zone modifiers (see linux/mmzone.h - low four bits)
*/
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
/* Mask covering all four zone-modifier bits defined above. */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
怎么樣根據 gfp_mask 中的 zone modifiers 來選擇分配所使用的 zone 呢?系統設計了一套算法來進行轉換:
具體的代碼如下:
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask() → prepare_alloc_pages() → gfp_zone():
/*
 * Translate the zone-modifier bits of a gfp mask into a zone type by
 * looking them up in GFP_ZONE_TABLE.
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
enum zone_type z;
/* (1) Keep only the zone-modifier bits (the low four bits) of the gfp flags. */
int bit = (__force int) (flags & GFP_ZONEMASK);
/* (2) Table lookup: GFP_ZONE_TABLE holds one GFP_ZONES_SHIFT-bit wide zone
entry per modifier combination; shift the entry for "bit" down and mask it.
Setting two or all of ___GFP_DMA, ___GFP_HIGHMEM and ___GFP_DMA32 at once
is forbidden; GFP_ZONE_BAD has a bit set for each such combination and the
VM_BUG_ON below tests it.
*/
z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
((1 << GFP_ZONES_SHIFT) - 1);
VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
return z;
}
/*
 * Lookup table mapping each valid combination of zone-modifier bits to a
 * zone. The entry for combination "c" is GFP_ZONES_SHIFT bits wide and
 * sits at bit offset c * GFP_ZONES_SHIFT, which is how gfp_zone() reads it.
 * Each line needs a continuation so the whole expression belongs to the
 * macro.
 */
#define GFP_ZONE_TABLE ( \
	(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)				       \
	| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)		       \
	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)	       \
	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)		       \
	| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)		       \
	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)
/*
 * Bitmap of forbidden zone-modifier combinations: bit "c" is set when the
 * combination "c" mixes more than one of ___GFP_DMA, ___GFP_HIGHMEM and
 * ___GFP_DMA32. gfp_zone() tests (GFP_ZONE_BAD >> bit) & 1.
 * Each line needs a continuation so the whole expression belongs to the
 * macro.
 */
#define GFP_ZONE_BAD ( \
	1 << (___GFP_DMA | ___GFP_HIGHMEM)				      \
	| 1 << (___GFP_DMA | ___GFP_DMA32)				      \
	| 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)				      \
	| 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)		      \
	| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
)
zone fallback 策略
通過上述的候選策略,我們選定了內存分配的 node 和 zone,然后開始分配。如果分配失敗,我們并不會馬上啟動內存回收,而是通過 fallback 機制嘗試從其他低級的 zone 中看看能不能借用一些內存。
fallback 的借用,只能是高級向低級借用,而不能是低級向高級借用。比如:原本想分配 Normal zone 的內存,失敗的情況下可以嘗試從 DMA32 zone 中分配內存,因為能用 Normal zone 地址范圍內存的使用者,肯定也可以用 DMA32 zone 地址范圍的內存。但是反過來就不行:原本需要 DMA32 zone 地址范圍的內存,你給他一個 Normal zone 的內存,地址超過了 4G,可能就超過了 DMA 設備的尋址能力。
系統還定義了一個 __GFP_THISNODE 標志,用來限制 fallback 時只能在本 node 上尋找合適的低級 zone。否則會在所有 node 上尋找合適的低級 zone。
該算法的具體實現(xiàn)如下:
pgdat->node_zonelists[ZONELIST_FALLBACK] // 跨 node FALLBACK機制生效,用來鏈接所有node的所有zone
pgdat->node_zonelists[ZONELIST_NOFALLBACK] // 如果gfp_mask設(shè)置了__GFP_THISNODE標志,跨 node FALLBACK機制失效,用來鏈接本node的所有zone
1、系統啟動時初始化這些鏈表:
start_kernel() → build_all_zonelists() → __build_all_zonelists() → build_zonelists() → build_zonelists_in_node_order()/build_thisnode_zonelists() → build_zonerefs_node():
2、內(nèi)存分配時確定使用的 fallback 鏈表:
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask() → prepare_alloc_pages() → node_zonelist():
/*
 * Return the zonelist of node @nid that matches the fallback behaviour
 * requested by @flags.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	/* (1) Pick the candidate zone list according to whether fallback is enabled. */
	return &NODE_DATA(nid)->node_zonelists[gfp_zonelist(flags)];
}
/*
 * Select which per-node zonelist index a gfp mask should use.
 */
static inline int gfp_zonelist(gfp_t flags)
{
	/* (1.2) Default: cross-node fallback is in effect. */
	int list_idx = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
	/* (1.1) __GFP_THISNODE disables cross-node fallback. */
	if (unlikely(flags & __GFP_THISNODE))
		list_idx = ZONELIST_NOFALLBACK;
#endif

	return list_idx;
}
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask() → finalise_ac():
/*
 * Finish setting up the allocation context @ac for the fast path: record
 * whether dirty pages should be spread and pick the starting zone.
 */
static inline void finalise_ac(gfp_t gfp_mask,
unsigned int order, struct alloc_context *ac)
{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/* (2) Pick the preferred zone from the fallback list: the first entry that
satisfies high_zoneidx and the nodemask. Per the article this is the highest
eligible zone of the local node.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
3、從原有zone分配失敗時,嘗試從 fallback zone 中分配內(nèi)存:
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask() → get_page_from_freelist():
/*
 * Abridged excerpt: only the zonelist iteration is shown here; the loop
 * body and the return statement are elided. The complete function is
 * listed later in this article.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
/* (1) Starting from the preferred zone, walk the zones of the fallback
list one by one and try to allocate from each in turn.
*/
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
}
}
lowmem reserve 機制
承接上述的 fallback 機制,高等級的 zone 可以借用低等級 zone 的內存。但是從理論上說,低等級的內存更加寶貴,因為它的空間更小,如果被高等級的侵占完了,那么用戶需要低等級內存的時候就會分配失敗。
為了解決這個問題,系統給每個 zone 能夠給其他高等級 zone 借用的內存設置了一個預留值:可以借用內存,但是本 zone 保留的內存不能小于這個值。
我們可以通過命令來查看每個 zone 的 lowmem reserve 大小設置,protection 字段描述了本 zone 給其他 zone 借用時必須保留的內存:
pwl@ubuntu:~$ cat /proc/zoneinfo
Node 0, zone DMA
pages free 3968
min 67
low 83
high 99
spanned 4095
present 3997
managed 3976
// 本 zone 為 DMA
// 給 DMA zone 借用時必須保留 0 pages
// 給 DMA32 zone 借用時必須保留 2934 pages
// 給 Normal/Movable/Device zone 借用時必須保留 3859 pages
protection: (0, 2934, 3859, 3859, 3859)
Node 0, zone DMA32
pages free 418978
min 12793
low 15991
high 19189
spanned 1044480
present 782288
managed 759701
// 本 zone 為 DMA32
// 給 DMA/DMA32 zone 借用時必須保留 0 pages
// 給 Normal/Movable/Device zone 借用時必須保留 925 pages
protection: (0, 0, 925, 925, 925)
nr_free_pages 418978
Node 0, zone Normal
pages free 4999
min 4034
low 5042
high 6050
spanned 262144
present 262144
managed 236890
// 本 zone 為 Normal
// 因為 Movable/Device zone 大小為0,所以給所有 zone 借用時必須保留 0 pages
protection: (0, 0, 0, 0, 0)
Node 0, zone Movable
pages free 0
min 0
low 0
high 0
spanned 0
present 0
managed 0
protection: (0, 0, 0, 0, 0)
Node 0, zone Device
pages free 0
min 0
low 0
high 0
spanned 0
present 0
managed 0
protection: (0, 0, 0, 0, 0)
可以通過 lowmem_reserve_ratio 來調節這個值的大小:
pwl@ubuntu:~$ cat /proc/sys/vm/lowmem_reserve_ratio
256 256 32 0 0
order fallback 策略
Buddy 系統中對每一個 zone 又細分了多個 order 的 free_area:
/*
 * Number of buddy orders: valid orders are 0 .. MAX_ORDER - 1. Defaults to
 * 11 unless the configuration forces another value.
 */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
如果在對應 order 的 free_area 中找不到 free 內存的話,會逐個往高級別 order 的 free_area 中查找,直至 max_order。
從高級別 order 的 freelist 中取到的內存塊,會被分割成多個低級別 order 的塊,剩余部分掛回對應低級別 order 的 freelist。
migrate type 候選策略
Buddy 系統中對每一個 zone 中的每一個 order free_area 又細分了多個 migrate type :
/*
 * Migrate types: every free_area keeps one free list per migrate type.
 * The first MIGRATE_PCPTYPES entries are the types that also have per-cpu
 * (pcp) lists.
 * NOTE(review): the listing appears to omit the kernel's config guards
 * around some entries — confirm against the real header.
 */
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
MIGRATE_CMA,
MIGRATE_ISOLATE, /* can't allocate from here */
MIGRATE_TYPES
};
gfp_mask 中也定義了一系列選擇 migrate type 的flag:
/* gfp bits that select the migrate type of an allocation. */
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
/* Mask of both mobility bits; gfpflags_to_migratetype() extracts them with it. */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
根據 gfp_mask 轉換成 migrate type 的代碼如下:
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask() → prepare_alloc_pages() → gfpflags_to_migratetype():
/*
 * Convert the mobility bits of @gfp_flags into a migrate type.
 *
 * Returns MIGRATE_UNMOVABLE when grouping by mobility is disabled;
 * otherwise the mobility bits shifted down by GFP_MOVABLE_SHIFT.
 */
static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{
	const gfp_t mobility = gfp_flags & GFP_MOVABLE_MASK;

	/* Requesting both mobility bits at once is a caller bug. */
	VM_WARN_ON(mobility == GFP_MOVABLE_MASK);
	/* The shift-based conversion below relies on these layout facts. */
	BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
	BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* Group based on mobility */
	/*
	 * (1) Only three results are possible:
	 * MIGRATE_UNMOVABLE / MIGRATE_MOVABLE / MIGRATE_RECLAIMABLE
	 */
	return mobility >> GFP_MOVABLE_SHIFT;
}
migrate fallback 策略
在指定 migrate type 的 order 和大于 order 的 free list 分配失敗時,可以從同一 zone 的其他 migrate type freelist 中偷取內存。
/*
 * Migrate-type fallback table: row [T] lists, in order, the other migrate
 * types that an allocation of type T may take free memory from.
 * Each row ends with MIGRATE_TYPES, presumably as an end-of-list marker
 * (NOTE(review): confirm against the code that walks this table).
 */
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
fallbacks[] 數組定義了當前 migrate type 可以從哪些其他 migrate type 偷取空閑內存,基本就是 MIGRATE_UNMOVABLE、MIGRATE_RECLAIMABLE、MIGRATE_MOVABLE 可以相互偷取。
具體的代碼如下:
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask() → get_page_from_freelist() → rmqueue() → __rmqueue() → __rmqueue_fallback():
reclaim watermark
分配時如果 freelist 中現有的內存不能滿足需求,則會啟動內存回收。系統對每個 zone 定義了三種內存水位 high/low/min,針對不同的水位采取不同的回收策略:
pwl@ubuntu:~$ cat /proc/zoneinfo
Node 0, zone DMA
pages free 3968
min 67
low 83
high 99
具體三種水位的回收策略如下:
reclaim 方式
系統設計了幾種回收內存的手段:
alloc_pages()
Buddy 內存分配的核心代碼實現。
alloc_pages() → alloc_pages_current() → __alloc_pages_nodemask():
/*
 * Heart of the buddy allocator: allocate 2^order pages.
 *
 * Tries a fast-path allocation against the low watermark first and falls
 * back to __alloc_pages_slowpath() when that fails. Returns the allocated
 * page, or NULL on failure (including order >= MAX_ORDER).
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
/* (1.1) The fast path allocates against the low watermark by default. */
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
/* (1.2) Validate the requested order. */
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/* (1.3) Filter gfp_mask through the currently allowed mask. */
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
/* (1.4) Derive high_zoneidx, the candidate zone list and the migrate type
from gfp_mask.
*/
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
/* (1.5) Pick the first suitable zone. */
finalise_ac(gfp_mask, order, &ac);
/* First allocation attempt */
/* (2) 1st attempt: allocate straight from the free lists using the low watermark. */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
/* (3.1) If NOFS/NOIO was set through memalloc_no{fs,io}_{save,restore}, the
corresponding state is read from current->flags and used to clear __GFP_FS
and __GFP_IO from the mask (per the article).
*/
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
/* (3.2) Restore the caller's nodemask. */
if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
/* (4) Slow path: use the min watermark and the various reclaim mechanisms,
then retry the allocation.
*/
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
/* If memcg kmem accounting applies and charging fails, give the page back. */
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
}
|→
/*
 * Fill in the allocation context @ac from the gfp mask and adjust
 * *alloc_mask / *alloc_flags for cpusets and CMA.
 *
 * Returns false when should_fail_alloc_page() says the allocation should
 * fail, true otherwise.
 */
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
/* (1.4.1) Highest zone type this gfp_mask may allocate from. */
ac->high_zoneidx = gfp_zone(gfp_mask);
/* (1.4.2) Zone list of the preferred node, chosen according to gfp_mask. */
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
/* (1.4.3) Migrate type derived from gfp_mask:
MIGRATE_UNMOVABLE/MIGRATE_MOVABLE/MIGRATE_RECLAIMABLE
*/
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
/* (1.4.4) With cpusets enabled: add __GFP_HARDWALL, and either default the
nodemask to the current cpuset's allowed nodes or, when the caller supplied
a nodemask, set ALLOC_CPUSET.
*/
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
if (!ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
}
/* (1.4.5) Per the article: if __GFP_FS is set, try to take the fs reclaim
lock; it is released again immediately.
*/
fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);
/* (1.4.6) With __GFP_DIRECT_RECLAIM the allocation may sleep; assert that
the current context allows sleeping.
*/
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
return false;
/* (1.4.7) Let MIGRATE_MOVABLE allocations use MIGRATE_CMA areas. */
if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
*alloc_flags |= ALLOC_CMA;
return true;
}
get_page_from_freelist()
第一次的快速內存分配,和后續的慢速內存分配,最后都是調用 get_page_from_freelist() 從 freelist 中獲取內存。
/*
 * Walk the zonelist starting at the preferred zone and return the first
 * 2^order block that can be taken from a zone passing the watermark
 * checks. Returns NULL when no zone yields a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
/* (2.5.1) Iterate over the fallback zonelist and try each eligible zone
(zone index <= high_zoneidx, node in nodemask).
*/
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
/* Skip zones the cpuset does not allow when ALLOC_CPUSET is in force. */
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/* (2.5.2) __GFP_WRITE marks pages that will be dirtied; spread them across
nodes. If this node is over its dirty limit, remember it and move on to
zones of another node.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
/* (2.5.3) Watermark this allocation has to stay above. */
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/* (2.5.4) Check whether the zone's free pages satisfy the request:
1. total free pages >= (2^order) + watermark + lowmem_reserve
2. a contiguous free block of 2^order pages is available
*/
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
/* (2.5.5) Not enough free memory: work through the checks below. */
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
/* (2.5.6) Watermarks may be ignored: go straight to the allocation attempt. */
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
/* (2.5.7) Quick node reclaim trying to free 2^order pages.
Per the article: by default it neither unmaps nor writes back pages, and its
reclaim priority is 4, i.e. shrink_node() is called at most that many times.
__node_reclaim() calls shrink_node() with this scan_control:
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = current_gfp_context(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), // 0 by default
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), // 0 by default
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
};
*/
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
/* (2.5.8) Reclaim made progress: re-check whether free memory now suffices. */
if (zone_watermark_ok(zone, order, mark,
ac_classzone_idx(ac), alloc_flags))
goto try_this_zone;
continue;
}
}
try_this_zone:
/* (2.5.9) Conditions met: actually take 2^order pages off the free lists. */
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
/* (2.5.10) Post-allocation setup of the struct page. */
prep_new_page(page, order, gfp_mask, alloc_flags);
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page;
}
}
return NULL;
}
||→
/*
 * Fast watermark check: a cheap test for order-0 requests, falling back to
 * the full __zone_watermark_ok() check otherwise.
 */
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, unsigned int alloc_flags)
{
/* (2.5.4.1) Number of free pages currently in the zone. */
long free_pages = zone_page_state(z, NR_FREE_PAGES);
long cma_pages = 0;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
/* (2.5.4.2) Order-0 shortcut: usable free pages already exceed the
watermark plus the lowmem reserve.
*/
if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
return true;
/* (2.5.4.3) Otherwise run the full (slow) check. */
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
}
|||→
/*
 * Full watermark check for zone @z.
 *
 * Returns true when, after the request and all applicable reserves are
 * subtracted, free pages stay above the (possibly lowered) watermark plus
 * lowmem_reserve[classzone_idx], and - for order > 0 - at least one
 * suitable free list of order >= @order is non-empty.
 */
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int classzone_idx, unsigned int alloc_flags,
long free_pages)
{
long min = mark;
int o;
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
/* (2.5.4.3.1) Subtract the size of the request (2^order - 1 pages) from the
free page count before comparing with the watermark.
*/
free_pages -= (1 << order) - 1;
/* (2.5.4.3.2) High-priority allocations (ALLOC_HIGH) halve the watermark. */
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
/*
* If the caller does not have rights to ALLOC_HARDER then subtract
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
/* (2.5.4.3.3) Non-"harder" allocations must also leave nr_reserved_highatomic
pages untouched.
*/
if (likely(!alloc_harder)) {
free_pages -= z->nr_reserved_highatomic;
/* (2.5.4.3.4) "Harder" allocations lower the watermark further: by half
for ALLOC_OOM, by a quarter otherwise.
*/
} else {
/*
* OOM victims can try even harder than normal ALLOC_HARDER
* users on the grounds that it's definitely going to be in
* the exit path shortly and free memory. Any allocation it
* makes during the free path will be small and short-lived.
*/
if (alloc_flags & ALLOC_OOM)
min -= min / 2;
else
min -= min / 4;
}
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
/* (2.5.4.3.5) Allocations not allowed to use CMA must not count free CMA pages. */
if (!(alloc_flags & ALLOC_CMA))
free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
* even if a suitable page happened to be free.
*/
/* (2.5.4.3.6) What remains must still exceed watermark +
lowmem_reserve[classzone_idx]. Passing this only shows that the total
amount of free memory is sufficient; whether a contiguous 2^order block
exists is checked further below.
*/
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return false;
/* If this is an order-0 request then the watermark is fine */
/* (2.5.4.3.7) Order 0 needs no further check: enough total pages means a
single page can be found.
*/
if (!order)
return true;
/* For a high-order request, check at least one suitable page is free */
/* (2.5.4.3.8) Walk the free areas of every order >= the requested order. */
for (o = order; o < MAX_ORDER; o++) {
struct free_area *area = &z->free_area[o];
int mt;
if (!area->nr_free)
continue;
/* (2.5.4.3.9) Succeed as soon as any pcp-type migrate list of this order
is non-empty.
*/
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
if (!list_empty(&area->free_list[mt]))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
!list_empty(&area->free_list[MIGRATE_CMA])) {
return true;
}
#endif
/* "Harder" allocations may also draw on the high-atomic reserve list. */
if (alloc_harder &&
!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
return true;
}
return false;
}
rmqueue()
找到合適且有足夠 free 內存的 zone 以后,rmqueue() 負責從 freelist 中摘取 page。
rmqueue() → __rmqueue():
/*
 * Take a 2^order block of the given migrate type from @zone, falling back
 * to CMA (for movable requests) and then to other migrate types when the
 * requested type has nothing suitable. Returns NULL if all of that fails.
 */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype)
{
struct page *page;
retry:
/* (1) Allocate from the free lists of the originally requested migrate type. */
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
/* (2) Still nothing: try to steal memory from another migrate type's
lists, and retry the normal allocation if the steal succeeded.
*/
if (!page && __rmqueue_fallback(zone, order, migratetype))
goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
↓
/*
 * Find the smallest free block of order >= @order on the @migratetype
 * free lists of @zone, remove it, and give the unused remainder back to
 * the lower-order lists. Returns NULL when every such list is empty.
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
/* (1.1) Check the migratetype free list of each free_area with order >= the
requested order, smallest first, for a free block.
*/
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
if (!page)
continue;
/* (1.1.1) Unlink the block from the free list. */
list_del(&page->lru);
/* Clear the order information stored in the page. Per the article:
page->_mapcount = -1
page->private = 0
*/
rmv_page_order(page);
area->nr_free--;
/* (1.1.2) Put the unused remainder back on the lower-order free lists. */
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
__alloc_pages_slowpath()
/*
 * __alloc_pages_slowpath - slow path of the page allocator.
 * @gfp_mask: allocation flags; the bits tested here are __GFP_ATOMIC,
 *            __GFP_DIRECT_RECLAIM, __GFP_KSWAPD_RECLAIM, __GFP_NORETRY,
 *            __GFP_RETRY_MAYFAIL, __GFP_NOMEMALLOC and __GFP_NOFAIL
 * @order:    allocation order, passed unchanged to every attempt
 * @ac:       allocation context; ac->preferred_zoneref is recomputed here
 *
 * Makes a sequence of increasingly expensive allocation attempts: a
 * free-list attempt with recomputed alloc_flags, direct compaction for
 * selected requests, a free-list attempt with possibly replaced
 * alloc_flags, direct reclaim, direct compaction again, and finally the
 * OOM path.  Between attempts it decides whether to loop back to "retry"
 * or "retry_cpuset", or to give up.
 *
 * Returns the allocated page.  On failure every exit runs through
 * warn_alloc() at "fail" and returns page, which is NULL at that point
 * because each earlier attempt that produced a page jumped to got_pg.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
int reserve_flags;
/*
 * We also sanity check to catch abuse of atomic reserves being used by
 * callers that are not in atomic context.
 */
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
/* Every restart from here resets the retry bookkeeping. */
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
/*
 * The fast path uses conservative alloc_flags to succeed only until
 * kswapd needs to be woken up, and to avoid the cost of setting up
 * alloc_flags precisely. So we do that now.
 */
/*
 * (1) Derive the allocation flags from gfp_mask.  Per the original
 *     annotation (gfp_to_alloc_flags() is not shown here):
 *       ALLOC_WMARK_MIN - the watermark is lowered to "min"
 *       ALLOC_HARDER    - lowered further for atomic requests or rt tasks
 */
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
 * We need to recalculate the starting point for the zonelist iterator
 * because we might have used different nodemask in the fast path, or
 * there was a cpuset modification and we are retrying - otherwise we
 * could end up iterating over non-eligible zones endlessly.
 */
/* (2) Re-establish the starting point of the fallback zone list. */
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
/*
 * (3) Reaching the slow path means the allocation already failed at the
 *     "low" watermark (per the original annotation), so wake the
 *     asynchronous kswapd reclaim threads first.
 */
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
 * The adjusted alloc_flags might result in immediate success, so try
 * that first
 */
/*
 * (4) 2nd attempt: allocate straight from the free lists with the
 *     alloc_flags computed above (the "min" watermark, per the original
 *     annotation).
 */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
 * For costly allocations, try direct compaction first, as it's likely
 * that we have enough base pages and don't need to reclaim. For non-
 * movable high-order allocations, do that as well, as compaction will
 * try prevent permanent fragmentation by migrating from blocks of the
 * same migratetype.
 * Don't try this for allocations that are allowed to ignore
 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
 */
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
/*
 * (5) 3rd attempt: run direct compaction, then try to allocate
 *     (the original annotation notes this ends in
 *     get_page_from_freelist()).
 */
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
 * Checks for costly allocations with __GFP_NORETRY, which
 * includes THP page fault allocations
 */
if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
 * If compaction is deferred for high-order allocations,
 * it is because sync compaction recently failed. If
 * this is the case and the caller requested a THP
 * allocation, we do not want to heavily disrupt the
 * system, so we fail the allocation instead of entering
 * direct reclaim.
 */
if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
 * Looks like reclaim/compaction is worth trying, but
 * sync compaction could be very expensive, so keep
 * using async compaction.
 */
compact_priority = INIT_COMPACT_PRIORITY;
}
}
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
/*
 * (6) Wake the kswapd reclaim threads once more; the original annotation
 *     adds that the constraints in ac may have become stricter by now.
 */
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
 * (7) Ask __gfp_pfmemalloc_flags() for reserve flags; a non-zero result
 *     replaces alloc_flags entirely.  Per the original annotation this is
 *     where ALLOC_NO_WATERMARKS comes in, i.e. the watermark is ignored
 *     altogether (helper not shown here).
 */
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = reserve_flags;
/*
 * Reset the zonelist iterators if memory policies can be ignored.
 * These allocations are high priority and system rather than user
 * orientated.
 */
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
/*
 * (8) 4th attempt: allocate from the free lists again, now with the
 *     possibly replaced alloc_flags (no watermark, per the original
 *     annotation, when reserve flags were granted).
 */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */
/*
 * (9) Without direct reclaim there is nothing more to do here; bail out
 *     and leave the work to the asynchronous kswapd reclaim.
 */
if (!can_direct_reclaim)
goto nopage;
/* Avoid recursion of direct reclaim */
/* (10) Do not recurse into reclaim when PF_MEMALLOC is set on current. */
if (current->flags & PF_MEMALLOC)
goto nopage;
/* Try direct reclaim and then allocating */
/*
 * (11) 5th attempt: direct reclaim followed by an allocation attempt;
 *      did_some_progress is filled in for the retry decisions below.
 */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating */
/*
 * (12) 6th attempt: direct compaction at the current compact_priority,
 *      followed by an allocation attempt.
 */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested */
/* (13) Still no page and the caller forbids retrying: give up. */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
 * Do not retry costly high order allocations unless they are
 * __GFP_RETRY_MAYFAIL
 */
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
/* (14) Check whether retrying reclaim makes sense. */
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
/*
 * It doesn't make any sense to retry for the compaction if the order-0
 * reclaim is not able to make any progress because the current
 * implementation of the compaction depends on the sufficient amount
 * of free memory (see __compaction_suitable)
 */
/* (15) Check whether retrying compaction makes sense. */
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;
/* Deal with possible cpuset update races before we start OOM killing */
/*
 * (16) Before starting OOM killing, restart from retry_cpuset if
 *      check_retry_cpuset() says a cpuset update warrants it.
 */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
/*
 * (17) 7th attempt: every reclaim attempt has failed, so fall back to the
 *      last resort of freeing memory by killing processes.
 */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
/* (18) Stop here for an OOM victim that matches the conditions below. */
if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
/* (19) Retry, with the no-progress counter cleared, while the OOM path reports progress. */
if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}
nopage:
/* Deal with possible cpuset update races before we fail */
/* (20) Handle a possible cpuset update before failing. */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
 * we always retry
 */
/* (21) With __GFP_NOFAIL set, the only option is to keep retrying. */
if (gfp_mask & __GFP_NOFAIL) {
/*
 * All existing users of the __GFP_NOFAIL are blockable, so warn
 * of any new users that actually require GFP_NOWAIT
 */
if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
/* (22) Emit the allocation-failure warning. */
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}