diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 988f6a4c8084fb..498655c322bc8f 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -72,7 +72,6 @@ Currently, these files are in /proc/sys/vm: - unprivileged_userfaultfd - user_reserve_kbytes - vfs_cache_pressure -- watermark_boost_factor - watermark_scale_factor - zone_reclaim_mode @@ -968,26 +967,6 @@ directory and inode objects. With vfs_cache_pressure=1000, it will look for ten times more freeable objects than there are. -watermark_boost_factor -====================== - -This factor controls the level of reclaim when memory is being fragmented. -It defines the percentage of the high watermark of a zone that will be -reclaimed if pages of different mobility are being mixed within pageblocks. -The intent is that compaction has less work to do in the future and to -increase the success rate of future high-order allocations such as SLUB -allocations, THP and hugetlbfs pages. - -To make it sensible with respect to the watermark_scale_factor -parameter, the unit is in fractions of 10,000. The default value of -15,000 means that up to 150% of the high watermark will be reclaimed in the -event of a pageblock being mixed due to fragmentation. The level of reclaim -is determined by the number of fragmentation events that occurred in the -recent past. If this value is smaller than a pageblock then a pageblocks -worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor -of 0 will disable the feature. - - watermark_scale_factor ====================== diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88188549647c72..5f771d65b6f887 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1229,6 +1229,8 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); + else + hugetlb_cma_reserve(PMD_SHIFT - PAGE_SHIFT); /* * Reserve memory for crash kernel after SRAT is parsed so that it diff --git a/block/bdev.c b/block/bdev.c index edc110d90df404..6abe4766d0737a 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -488,7 +488,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) inode->i_mode = S_IFBLK; inode->i_rdev = 0; inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); + mapping_set_gfp_mask(&inode->i_data, GFP_USER|__GFP_MOVABLE); bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 52a9ff65faee6d..da5fd98df3635a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -10,7 +10,6 @@ enum compact_priority { COMPACT_PRIO_SYNC_FULL, MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, - MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC @@ -56,6 +55,7 @@ enum compact_result { }; struct alloc_context; /* in mm/internal.h */ +struct capture_control; /* in mm/internal.h */ /* * Number of free order-0 pages that should be available above given watermark @@ -94,10 +94,10 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, - struct page **page); + struct capture_control *capc); 
extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx); + int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -187,7 +187,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int highest_zoneidx) + int highest_zoneidx) { return COMPACT_SKIPPED; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 65a78773dccad2..78b5176d354efa 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -19,8 +19,6 @@ static inline int gfp_migratetype(const gfp_t gfp_flags) BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); - BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> - GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; diff --git a/include/linux/mm.h b/include/linux/mm.h index f13f20258ce989..e7c2631848ed49 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2746,7 +2746,6 @@ extern void setup_per_cpu_pageset(void); /* page_alloc.c */ extern int min_free_kbytes; -extern int watermark_boost_factor; extern int watermark_scale_factor; extern bool arch_has_descending_max_zone_pfns(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4f7..1363ff6caff3b1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -44,7 +44,7 @@ enum migratetype { MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ - MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, + MIGRATE_FREE = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way @@ -88,7 +88,7 @@ static inline bool is_migrate_movable(int mt) */ static inline bool migratetype_is_mergeable(int mt) { - return mt < MIGRATE_PCPTYPES; + return mt < MIGRATE_PCPTYPES || mt == MIGRATE_FREE; } #define for_each_migratetype_order(order, type) \ @@ -138,6 +138,10 @@ enum numa_stat_item { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_FREE_UNMOVABLE, + NR_FREE_MOVABLE, + NR_FREE_RECLAIMABLE, + NR_FREE_FREE, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, @@ -552,23 +556,21 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list - * for THP which will usually be GFP_MOVABLE. Even if it is another type, - * it should not contribute to serious fragmentation causing THP allocation - * failures. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional set + * for THP (usually GFP_MOVABLE, but with exception of the huge zero page.) 
*/ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define NR_PCP_THP 1 +#define NR_PCP_THP MIGRATE_PCPTYPES #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) -#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) -#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) -#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) +#define min_wmark_pages(z) (z->_watermark[WMARK_MIN]) +#define low_wmark_pages(z) (z->_watermark[WMARK_LOW]) +#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH]) +#define wmark_pages(z, i) (z->_watermark[i]) /* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { @@ -707,9 +709,6 @@ struct zone { /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; - unsigned long watermark_boost; - - unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be @@ -884,9 +883,6 @@ enum pgdat_flags { }; enum zone_flags { - ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. - * Cleared when kswapd is woken. - */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ }; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 5456b7be38ae50..b519fffb3deee2 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -35,26 +35,14 @@ static inline bool is_migrate_isolate(int migratetype) void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable); - -/* - * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. - */ -int -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); - -/* - * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. - * target range is [start_pfn, end_pfn) - */ -void -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype); - -/* - * Test all pages in [start_pfn, end_pfn) are isolated or not. 
- */ + int old_mt, int new_mt, int *num_movable); + +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); + +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype); + int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int isol_flags); diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 5f1ae07d724b88..05b6811f8cee5e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -47,8 +47,8 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE */ -/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order (MAX_ORDER-1) +/* Manage fragmentation at the 2M level */ +#define pageblock_order ilog2(2U << (20 - PAGE_SHIFT)) #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 19cf5b6892ceba..219ccf3f91cdc9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -481,14 +481,6 @@ static inline void node_stat_sub_folio(struct folio *folio, mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } -static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, - int migratetype) -{ - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); - if (is_migrate_cma(migratetype)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); -} - extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137d4abe3eda11..68bcd3a7c9c6b3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2229,14 +2229,6 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = SYSCTL_ZERO, }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "watermark_scale_factor", .data = &watermark_scale_factor, diff --git a/mm/compaction.c b/mm/compaction.c index 8238e83385a791..5890d64d814e08 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -745,8 +745,9 @@ isolate_freepages_range(struct compact_control *cc, } /* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(pg_data_t *pgdat) +static bool too_many_isolated(struct compact_control *cc) { + pg_data_t *pgdat = cc->zone->zone_pgdat; bool too_many; unsigned long active, inactive, isolated; @@ -758,6 +759,16 @@ static bool too_many_isolated(pg_data_t *pgdat) isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + node_page_state(pgdat, NR_ISOLATED_ANON); + /* + * GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a + * circular deadlock. GFP_NOIO won't get here. + */ + if (cc->gfp_mask & __GFP_FS) { + inactive >>= 3; + active >>= 3; + } + too_many = isolated > (inactive + active) / 2; if (!too_many) wake_throttle_isolated(pgdat); @@ -806,7 +817,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * list by either parallel reclaimers or compaction. 
If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(pgdat))) { + while (unlikely(too_many_isolated(cc))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) return -EAGAIN; @@ -1247,51 +1258,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -static bool suitable_migration_source(struct compact_control *cc, - struct page *page) -{ - int block_mt; - - if (pageblock_skip_persistent(page)) - return false; - - if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) - return true; - - block_mt = get_pageblock_migratetype(page); - - if (cc->migratetype == MIGRATE_MOVABLE) - return is_migrate_movable(block_mt); - else - return block_mt == cc->migratetype; -} - -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct compact_control *cc, - struct page *page) -{ - /* If the page is a large free page, then disallow migration */ - if (PageBuddy(page)) { - /* - * We are checking page_order without zone->lock taken. But - * the only small danger is that we skip a potentially suitable - * pageblock, so it's not worth to check order for valid range. - */ - if (buddy_order_unsafe(page) >= pageblock_order) - return false; - } - - if (cc->ignore_block_suitable) - return true; - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (is_migrate_movable(get_pageblock_migratetype(page))) - return true; - - /* Otherwise skip the block */ - return false; -} - static inline unsigned int freelist_scan_limit(struct compact_control *cc) { @@ -1614,7 +1580,7 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Check the block is suitable for migration */ - if (!suitable_migration_target(cc, page)) + if (!is_migrate_movable(get_pageblock_migratetype(page))) continue; /* If isolation recently failed, do not retry */ @@ -1778,15 +1744,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) return pfn; - /* - * Only allow kcompactd and direct requests for movable pages to - * quickly clear out a MOVABLE pageblock for allocation. This - * reduces the risk that a large movable pageblock is freed for - * an unmovable/reclaimable small allocation. - */ - if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) - return pfn; - /* * When starting the migration scanner, pick any pageblock within the * first half of the search space. Otherwise try and pick a pageblock @@ -1930,14 +1887,12 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) continue; /* - * For async direct compaction, only scan the pageblocks of the - * same migratetype without huge pages. Async direct compaction - * is optimistic to see if the minimum amount of work satisfies - * the allocation. The cached PFN is updated as it's possible - * that all remaining blocks between source and target are + * The cached PFN is updated as it's possible that all + * remaining blocks between source and target are * unsuitable and the compaction scanners fail to meet. 
*/ - if (!suitable_migration_source(cc, page)) { + if (pageblock_skip_persistent(page) || + !is_migrate_movable(get_pageblock_migratetype(page))) { update_cached_migrate(cc, block_end_pfn); continue; } @@ -2057,134 +2012,11 @@ static bool should_proactive_compact_node(pg_data_t *pgdat) return fragmentation_score_node(pgdat) > wmark_high; } -static enum compact_result __compact_finished(struct compact_control *cc) -{ - unsigned int order; - const int migratetype = cc->migratetype; - int ret; - - /* Compaction run completes if the migrate and free scanner meet */ - if (compact_scanners_met(cc)) { - /* Let the next compaction start anew. */ - reset_cached_positions(cc->zone); - - /* - * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kcompactd does not set the - * flag itself as the decision to be clear should be directly - * based on an allocation request. - */ - if (cc->direct_compaction) - cc->zone->compact_blockskip_flush = true; - - if (cc->whole_zone) - return COMPACT_COMPLETE; - else - return COMPACT_PARTIAL_SKIPPED; - } - - if (cc->proactive_compaction) { - int score, wmark_low; - pg_data_t *pgdat; - - pgdat = cc->zone->zone_pgdat; - if (kswapd_is_running(pgdat)) - return COMPACT_PARTIAL_SKIPPED; - - score = fragmentation_score_zone(cc->zone); - wmark_low = fragmentation_score_wmark(pgdat, true); - - if (score > wmark_low) - ret = COMPACT_CONTINUE; - else - ret = COMPACT_SUCCESS; - - goto out; - } - - if (is_via_compact_memory(cc->order)) - return COMPACT_CONTINUE; - - /* - * Always finish scanning a pageblock to reduce the possibility of - * fallbacks in the future. This is particularly important when - * migration source is unmovable/reclaimable but it's not worth - * special casing. - */ - if (!pageblock_aligned(cc->migrate_pfn)) - return COMPACT_CONTINUE; - - /* Direct compactor: Is a suitable page free? */ - ret = COMPACT_NO_SUITABLE_PAGE; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &cc->zone->free_area[order]; - bool can_steal; - - /* Job done if page is free of the right migratetype */ - if (!free_area_empty(area, migratetype)) - return COMPACT_SUCCESS; - -#ifdef CONFIG_CMA - /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ - if (migratetype == MIGRATE_MOVABLE && - !free_area_empty(area, MIGRATE_CMA)) - return COMPACT_SUCCESS; -#endif - /* - * Job done if allocation would steal freepages from - * other migratetype buddy lists. - */ - if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - /* - * Movable pages are OK in any pageblock. If we are - * stealing for a non-movable allocation, make sure - * we finish compacting the current pageblock first - * (which is assured by the above migrate_pfn align - * check) so it is as free as possible and we won't - * have to steal another one soon. 
- */ - return COMPACT_SUCCESS; - } - -out: - if (cc->contended || fatal_signal_pending(current)) - ret = COMPACT_CONTENDED; - - return ret; -} - -static enum compact_result compact_finished(struct compact_control *cc) -{ - int ret; - - ret = __compact_finished(cc); - trace_mm_compaction_finished(cc->zone, cc->order, ret); - if (ret == COMPACT_NO_SUITABLE_PAGE) - ret = COMPACT_CONTINUE; - - return ret; -} - static enum compact_result __compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; - - if (is_via_compact_memory(order)) - return COMPACT_CONTINUE; - - watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); - /* - * If watermarks for high-order allocation are already met, there - * should be no need for compaction at all. - */ - if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, - alloc_flags)) - return COMPACT_SUCCESS; - /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the @@ -2217,14 +2049,18 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * COMPACT_CONTINUE - If compaction should run now */ enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx) { + unsigned long free_pages; enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, - zone_page_state(zone, NR_FREE_PAGES)); + /* Suitable migration targets */ + free_pages = zone_page_state(zone, NR_FREE_MOVABLE); + free_pages += zone_page_state(zone, NR_FREE_CMA_PAGES); + + ret = __compaction_suitable(zone, order, highest_zoneidx, free_pages); + /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation @@ -2267,25 +2103,117 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->highest_zoneidx, ac->nodemask) { unsigned long available; - enum compact_result compact_result; + available = zone_page_state_snapshot(zone, NR_FREE_MOVABLE); + available += zone_page_state_snapshot(zone, NR_FREE_CMA_PAGES); /* * Do not consider all the reclaimable memory because we do not * want to trash just for a single high order allocation which * is even not guaranteed to appear even if __compaction_suitable * is happy about the watermark check. */ - available = zone_reclaimable_pages(zone) / order; - available += zone_page_state_snapshot(zone, NR_FREE_PAGES); - compact_result = __compaction_suitable(zone, order, alloc_flags, - ac->highest_zoneidx, available); - if (compact_result == COMPACT_CONTINUE) + available += zone_reclaimable_pages(zone) / order; + + if (__compaction_suitable(zone, order, ac->highest_zoneidx, + available) == COMPACT_CONTINUE) return true; } return false; } +static enum compact_result __compact_finished(struct compact_control *cc) +{ + unsigned long mark; + int ret; + + /* Compaction run completes if the migrate and free scanner meet */ + if (compact_scanners_met(cc)) { + /* Let the next compaction start anew. */ + reset_cached_positions(cc->zone); + + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kcompactd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. 
+ */ + if (cc->direct_compaction) + cc->zone->compact_blockskip_flush = true; + + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; + } + + if (cc->proactive_compaction) { + int score, wmark_low; + pg_data_t *pgdat; + + pgdat = cc->zone->zone_pgdat; + if (kswapd_is_running(pgdat)) + return COMPACT_PARTIAL_SKIPPED; + + score = fragmentation_score_zone(cc->zone); + wmark_low = fragmentation_score_wmark(pgdat, true); + + if (score > wmark_low) + ret = COMPACT_CONTINUE; + else + ret = COMPACT_SUCCESS; + + goto out; + } + + if (is_via_compact_memory(cc->order)) + return COMPACT_CONTINUE; + + /* + * Always finish scanning a pageblock to reduce the possibility of + * fallbacks in the future. This is particularly important when + * migration source is unmovable/reclaimable but it's not worth + * special casing. + */ + if (!pageblock_aligned(cc->migrate_pfn)) + return COMPACT_CONTINUE; + + /* Done when watermarks are restored */ + ret = COMPACT_NO_SUITABLE_PAGE; + if (cc->direct_compaction) + mark = wmark_pages(cc->zone, cc->alloc_flags & ALLOC_WMARK_MASK); + else + mark = high_wmark_pages(cc->zone); + if (zone_watermark_ok(cc->zone, cc->order, mark, cc->highest_zoneidx, cc->alloc_flags)) + return COMPACT_SUCCESS; + + /* + * In the process of neutralizing blocks, compaction reduces + * the amount of migration targets. Re-check availability. + */ + if (compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx) == COMPACT_SKIPPED) + return COMPACT_SKIPPED; + +out: + if (cc->contended || fatal_signal_pending(current)) + ret = COMPACT_CONTENDED; + + return ret; +} + +static enum compact_result compact_finished(struct compact_control *cc) +{ + int ret; + + ret = __compact_finished(cc); + trace_mm_compaction_finished(cc->zone, cc->order, ret); + if (ret == COMPACT_NO_SUITABLE_PAGE) + ret = COMPACT_CONTINUE; + + return ret; +} + static enum compact_result compact_zone(struct compact_control *cc, struct capture_control *capc) { @@ -2309,14 +2237,30 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); - ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->highest_zoneidx); - /* Compaction is likely to fail */ - if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) - return ret; - /* huh, compaction_suitable is returning something unexpected */ - VM_BUG_ON(ret != COMPACT_CONTINUE); + if (!is_via_compact_memory(cc->order)) { + unsigned long watermark; + + /* Allocation can already succeed, nothing to do */ + if (cc->direct_compaction) + watermark = wmark_pages(cc->zone, + cc->alloc_flags & + ALLOC_WMARK_MASK); + else + watermark = high_wmark_pages(cc->zone); + if (zone_watermark_ok(cc->zone, cc->order, watermark, + cc->highest_zoneidx, cc->alloc_flags)) + return COMPACT_SUCCESS; + + ret = compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx); + /* Compaction is likely to fail */ + if (ret == COMPACT_SKIPPED) + return ret; + + /* huh, compaction_suitable is returning something unexpected */ + VM_BUG_ON(ret != COMPACT_CONTINUE); + } /* * Clear pageblock skip if there were failures recently and compaction @@ -2499,7 +2443,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, unsigned int alloc_flags, int highest_zoneidx, - struct page **capture) + struct capture_control *capc) { enum compact_result 
ret; struct compact_control cc = { @@ -2507,8 +2451,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .search_order = order, .gfp_mask = gfp_mask, .zone = zone, - .mode = (prio == COMPACT_PRIO_ASYNC) ? - MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, .highest_zoneidx = highest_zoneidx, .direct_compaction = true, @@ -2516,38 +2458,31 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; - struct capture_control capc = { - .cc = &cc, - .page = NULL, - }; - /* - * Make sure the structs are really initialized before we expose the - * capture control, in case we are interrupted and the interrupt handler - * frees a page. - */ + /* Use trylocks in migration if this is a filesystem allocation */ + if (prio == COMPACT_PRIO_ASYNC || !(gfp_mask & __GFP_FS)) + cc.mode = MIGRATE_ASYNC; + else + cc.mode = MIGRATE_SYNC_LIGHT; + + /* See the comment in __alloc_pages_direct_compact() */ barrier(); - WRITE_ONCE(current->capture_control, &capc); + WRITE_ONCE(capc->cc, &cc); - ret = compact_zone(&cc, &capc); + ret = compact_zone(&cc, capc); + + WRITE_ONCE(capc->cc, NULL); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - /* - * Make sure we hide capture control first before we read the captured - * page pointer, otherwise an interrupt could free and capture a page - * and we would leak it. - */ - WRITE_ONCE(current->capture_control, NULL); - *capture = READ_ONCE(capc.page); /* * Technically, it is also possible that compaction is skipped but * the page is still captured out of luck(IRQ came and freed the page). * Returning COMPACT_SUCCESS in such cases helps in properly accounting * the COMPACT[STALL|FAIL] when compaction is skipped. */ - if (*capture) + if (capc->page) ret = COMPACT_SUCCESS; return ret; @@ -2562,13 +2497,13 @@ int sysctl_extfrag_threshold = 500; * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation * @prio: Determines how hard direct compaction should try to succeed - * @capture: Pointer to free page created by compaction will be stored here + * @capc: The context for capturing pages during freeing * * This is the main entry point for direct page compaction. 
*/ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum compact_priority prio, struct page **capture) + enum compact_priority prio, struct capture_control *capc) { int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; @@ -2596,7 +2531,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac->highest_zoneidx, capture); + alloc_flags, ac->highest_zoneidx, capc); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2803,7 +2738,14 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + /* Allocation can succeed in any zone, done */ + if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, + high_wmark_pages(zone), + highest_zoneidx, 0)) + return true; + + /* Allocation can't succeed, but enough order-0 to compact */ + if (compaction_suitable(zone, pgdat->kcompactd_max_order, highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2841,8 +2783,13 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - if (compaction_suitable(zone, cc.order, 0, zoneid) != - COMPACT_CONTINUE) + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, cc.order, + high_wmark_pages(zone), zoneid, 0)) + continue; + + if (compaction_suitable(zone, cc.order, + zoneid) != COMPACT_CONTINUE) continue; if (kthread_should_stop()) diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032dea..1c0886c3ce0efb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -432,6 +432,8 @@ struct compact_control { */ struct capture_control { struct compact_control *cc; + int order; + int migratetype; struct page *page; }; @@ -739,7 +741,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ -#ifdef CONFIG_ZONE_DMA32 +#if defined(CONFIG_ZONE_DMA32) && !defined(CONFIG_COMPACTION) #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif @@ -776,16 +778,6 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; -static inline bool is_migrate_highatomic(enum migratetype migratetype) -{ - return migratetype == MIGRATE_HIGHATOMIC; -} - -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f17635..d7b9f0e70b5846 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1129,7 +1129,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, build_all_zonelists(NULL); /* Basic onlining is complete, allow allocation of onlined pages.
*/ - undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_FREE); /* * Freshly onlined pages aren't shuffled (e.g., all pages are placed to @@ -1951,7 +1951,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, failed_removal_isolated: /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_FREE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: lru_cache_enable(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bb3484563eda5..9bd86ee39ab157 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -379,7 +379,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Movable", "Reclaimable", - "HighAtomic", + "Free", #ifdef CONFIG_CMA "CMA", #endif @@ -401,7 +401,6 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; @@ -615,6 +614,17 @@ void set_pageblock_migratetype(struct page *page, int migratetype) page_to_pfn(page), MIGRATETYPE_MASK); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -710,7 +720,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); - return NR_LOWORDER_PCP_LISTS; + return NR_LOWORDER_PCP_LISTS + migratetype; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -724,7 +734,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pindex == NR_LOWORDER_PCP_LISTS) + if (pindex >= NR_LOWORDER_PCP_LISTS) order = pageblock_order; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -843,7 +853,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return false; @@ -854,15 +864,12 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, __SetPageGuard(page); INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return; @@ -870,14 +877,12 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __ClearPageGuard(page); set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int
migratetype) { return false; } + unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} + unsigned int order) {} #endif /* @@ -944,74 +949,40 @@ static inline void set_buddy_order(struct page *page, unsigned int order) __SetPageBuddy(page); } -#ifdef CONFIG_COMPACTION -static inline struct capture_control *task_capc(struct zone *zone) +static inline void account_freepages(struct page *page, struct zone *zone, + int nr_pages, int migratetype) { - struct capture_control *capc = current->capture_control; - - return unlikely(capc) && - !(current->flags & PF_KTHREAD) && - !capc->page && - capc->cc->zone == zone ? capc : NULL; -} - -static inline bool -compaction_capture(struct capture_control *capc, struct page *page, - int order, int migratetype) -{ - if (!capc || order != capc->cc->order) - return false; - - /* Do not accidentally pollute CMA or isolated regions*/ - if (is_migrate_cma(migratetype) || - is_migrate_isolate(migratetype)) - return false; - - /* - * Do not let lower order allocations pollute a movable pageblock. - * This might let an unmovable request use a reclaimable pageblock - * and vice-versa but no more than normal fallback logic which can - * have trouble finding a high-order free page. - */ - if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) - return false; - - capc->page = page; - return true; -} + if (is_migrate_isolate(migratetype)) + return; -#else -static inline struct capture_control *task_capc(struct zone *zone) -{ - return NULL; -} + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); -static inline bool -compaction_capture(struct capture_control *capc, struct page *page, - int order, int migratetype) -{ - return false; + if (migratetype <= MIGRATE_FREE) + __mod_zone_page_state(zone, NR_FREE_UNMOVABLE + migratetype, nr_pages); + else if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); + else + VM_WARN_ONCE(1, "unexpected migratetype %d\n", migratetype); } -#endif /* CONFIG_COMPACTION */ /* Used for pages not on another list */ static inline void add_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int migratetype, + bool tail) { struct free_area *area = &zone->free_area[order]; - list_add(&page->buddy_list, &area->free_list[migratetype]); - area->nr_free++; -} - -/* Used for pages not on another list */ -static inline void add_to_free_list_tail(struct page *page, struct zone *zone, - unsigned int order, int migratetype) -{ - struct free_area *area = &zone->free_area[order]; + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + if (tail) + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + else + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; + + account_freepages(page, zone, 1 << order, migratetype); } /* @@ -1020,16 +991,23 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, * allocation again (e.g., optimization for memory onlining). 
*/ static inline void move_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->buddy_list, &area->free_list[migratetype]); + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + + account_freepages(page, zone, -(1 << order), old_mt); + account_freepages(page, zone, 1 << order, new_mt); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, - unsigned int order) + unsigned int order, int migratetype) { + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + /* clear reported state and update reported page count */ if (page_reported(page)) __ClearPageReported(page); @@ -1038,8 +1016,106 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; + + account_freepages(page, zone, -(1 << order), migratetype); } +/* + * The order of subdivision here is critical for the IO subsystem. + * Please do not alter this order without good reasons and regression + * testing. Specifically, as large blocks of memory are subdivided, + * the order in which smaller blocks are delivered depends on the order + * they're subdivided in this function. This is the primary factor + * influencing the order in which pages are delivered to the IO + * subsystem according to empirical testing, and this is also justified + * by considering the behavior of a buddy system containing a single + * large block of memory acted on by a series of small allocations. + * This behavior is a critical factor in sglist merging's success. + * + * -- nyc + */ +static inline void expand(struct zone *zone, struct page *page, + int low, int high, int migratetype) +{ + unsigned long size = 1 << high; + + while (high > low) { + high--; + size >>= 1; + VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); + + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high)) + continue; + + add_to_free_list(&page[size], zone, high, migratetype, false); + set_buddy_order(&page[size], high); + } +} + +#ifdef CONFIG_COMPACTION +static inline struct capture_control *task_capc(struct zone *zone) +{ + struct capture_control *capc = current->capture_control; + + return unlikely(capc && capc->cc) && + !(current->flags & PF_KTHREAD) && + !capc->page && + capc->cc->zone == zone ? capc : NULL; +} + +static inline bool +compaction_capture(struct zone *zone, struct page *page, int order, + int migratetype, struct capture_control *capc) +{ + if (!capc || order < capc->order) + return false; + + /* Do not accidentally pollute CMA or isolated regions*/ + if (is_migrate_cma(migratetype) || + is_migrate_isolate(migratetype)) + return false; + + if (order >= pageblock_order) { + migratetype = capc->migratetype; + change_pageblock_range(page, order, migratetype); + } else if (migratetype == MIGRATE_MOVABLE) { + /* + * Do not let lower order allocations pollute a + * movable pageblock. 
This might let an unmovable + * request use a reclaimable pageblock and vice-versa + * but no more than normal fallback logic which can + * have trouble finding a high-order free page. + */ + return false; + } + + if (order > capc->order) + expand(zone, page, capc->order, order, migratetype); + + capc->page = page; + return true; +} + +#else +static inline struct capture_control *task_capc(struct zone *zone) +{ + return NULL; +} + +static inline bool +compaction_capture(struct zone *zone, struct page *page, int order, + int migratetype, struct capture_control *capc) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + /* * If this is not the largest possible page, check if the buddy * of the next-highest order is free. If it is, it's possible @@ -1104,32 +1180,28 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (likely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); - VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); while (order < MAX_ORDER - 1) { - if (compaction_capture(capc, page, order, migratetype)) { - __mod_zone_freepage_state(zone, -(1 << order), - migratetype); + int buddy_mt; + + if (compaction_capture(zone, page, order, migratetype, capc)) return; - } buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); if (!buddy) goto done_merging; + buddy_mt = get_pageblock_migratetype(buddy); + if (unlikely(order >= pageblock_order)) { /* * We want to prevent merge between freepages on pageblock * without fallbacks and normal pageblock. Without this, * pageblock isolation could cause incorrect freepage or CMA - * accounting or HIGHATOMIC accounting. + * accounting. */ - int buddy_mt = get_pageblock_migratetype(buddy); - if (migratetype != buddy_mt && (!migratetype_is_mergeable(migratetype) || !migratetype_is_mergeable(buddy_mt))) @@ -1141,9 +1213,9 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) - clear_page_guard(zone, buddy, order, migratetype); + clear_page_guard(zone, buddy, order); else - del_page_from_free_list(buddy, zone, order); + del_page_from_free_list(buddy, zone, order, buddy_mt); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -1153,6 +1225,13 @@ static inline void __free_one_page(struct page *page, done_merging: set_buddy_order(page, order); + /* If we freed one or more normal page blocks, mark them free.
*/ + if (unlikely(order >= pageblock_order && + migratetype_is_mergeable(migratetype))) { + change_pageblock_range(page, order, MIGRATE_FREE); + migratetype = MIGRATE_FREE; + } + if (fpi_flags & FPI_TO_TAIL) to_tail = true; else if (is_shuffle_order(order)) @@ -1160,10 +1239,7 @@ static inline void __free_one_page(struct page *page, else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - if (to_tail) - add_to_free_list_tail(page, zone, order, migratetype); - else - add_to_free_list(page, zone, order, migratetype); + add_to_free_list(page, zone, order, migratetype, to_tail); /* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) @@ -1205,10 +1281,8 @@ int split_free_page(struct page *free_page, } mt = get_pageblock_migratetype(free_page); - if (likely(!is_migrate_isolate(mt))) - __mod_zone_freepage_state(zone, -(1UL << order), mt); + del_page_from_free_list(free_page, zone, order, mt); - del_page_from_free_list(free_page, zone, order); for (pfn = free_page_pfn; pfn < free_page_pfn + (1UL << order);) { int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); @@ -1897,14 +1971,14 @@ static void __init deferred_free_range(unsigned long pfn, /* Free a large naturally-aligned chunk if possible */ if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, MIGRATE_FREE); __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, MIGRATE_FREE); __free_pages_core(page, 0); } } @@ -2322,44 +2396,6 @@ void __init init_cma_reserved_pageblock(struct page *page) } #endif -/* - * The order of subdivision here is critical for the IO subsystem. - * Please do not alter this order without good reasons and regression - * testing. Specifically, as large blocks of memory are subdivided, - * the order in which smaller blocks are delivered depends on the order - * they're subdivided in this function. This is the primary factor - * influencing the order in which pages are delivered to the IO - * subsystem according to empirical testing, and this is also justified - * by considering the behavior of a buddy system containing a single - * large block of memory acted on by a series of small allocations. - * This behavior is a critical factor in sglist merging's success. - * - * -- nyc - */ -static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) -{ - unsigned long size = 1 << high; - - while (high > low) { - high--; - size >>= 1; - VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. 
- * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - if (set_page_guard(zone, &page[size], high, migratetype)) - continue; - - add_to_free_list(&page[size], zone, high, migratetype); - set_buddy_order(&page[size], high); - } -} - static void check_new_page_bad(struct page *page) { if (unlikely(page->flags & __PG_HWPOISON)) { @@ -2559,11 +2595,15 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { + int actual_mt; + area = &(zone->free_area[current_order]); page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order); + /* move_freepages_block() may strand types on wrong list */ + actual_mt = get_pageblock_migratetype(page); + del_page_from_free_list(page, zone, current_order, actual_mt); expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, @@ -2582,11 +2622,19 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. */ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +#ifdef CONFIG_COMPACTION +static int fallbacks[MIGRATE_TYPES][2] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_FREE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_FREE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_FREE, MIGRATE_TYPES }, }; +#else +static int fallbacks[MIGRATE_TYPES][4] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_FREE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_FREE, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_FREE, MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +}; +#endif #ifdef CONFIG_CMA static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, @@ -2606,7 +2654,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, */ static int move_freepages(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int *num_movable) + int old_mt, int new_mt, int *num_movable) { struct page *page; unsigned long pfn; @@ -2633,7 +2681,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = buddy_order(page); - move_to_free_list(page, zone, order, migratetype); + move_to_free_list(page, zone, order, old_mt, new_mt); pfn += 1 << order; pages_moved += 1 << order; } @@ -2642,7 +2690,7 @@ static int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable) + int old_mt, int new_mt, int *num_movable) { unsigned long start_pfn, end_pfn, pfn; @@ -2659,19 +2707,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_pfn, end_pfn, migratetype, - num_movable); -} - -static void change_pageblock_range(struct page *pageblock_page, - int start_order, int migratetype) -{ - int nr_pageblocks = 1 << (start_order - pageblock_order); - - while (nr_pageblocks--) { - 
set_pageblock_migratetype(pageblock_page, migratetype); - pageblock_page += pageblock_nr_pages; - } + return move_freepages(zone, start_pfn, end_pfn, + old_mt, new_mt, num_movable); } /* @@ -2686,8 +2723,13 @@ static void change_pageblock_range(struct page *pageblock_page, * is worse than movable allocations stealing from unmovable and reclaimable * pageblocks. */ -static bool can_steal_fallback(unsigned int order, int start_mt) +static bool can_steal_fallback(unsigned int order, int start_mt, + int fallback_mt) { + /* The first allocation in a free block *must* claim it. */ + if (fallback_mt == MIGRATE_FREE) + return true; + /* * Leaving this order check is intended, although there is * relaxed order check in next check. The reason is that @@ -2707,43 +2749,6 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } -static inline bool boost_watermark(struct zone *zone) -{ - unsigned long max_boost; - - if (!watermark_boost_factor) - return false; - /* - * Don't bother in zones that are unlikely to produce results. - * On small machines, including kdump capture kernels running - * in a small area, boosting the watermark can cause an out of - * memory situation immediately. - */ - if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) - return false; - - max_boost = mult_frac(zone->_watermark[WMARK_HIGH], - watermark_boost_factor, 10000); - - /* - * high watermark may be uninitialised if fragmentation occurs - * very early in boot so do not boost. We do not fall - * through and boost by pageblock_nr_pages as failing - * allocations that early means that reclaim is not going - * to help and it may even be impossible to reclaim the - * boosted watermark resulting in a hang. - */ - if (!max_boost) - return false; - - max_boost = max(pageblock_nr_pages, max_boost); - - zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, - max_boost); - - return true; -} - /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this @@ -2761,33 +2766,34 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, old_block_type = get_pageblock_migratetype(page); - /* - * This can happen due to races and we want to prevent broken - * highatomic accounting. - */ - if (is_migrate_highatomic(old_block_type)) - goto single_page; - /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); goto single_page; } - /* - * Boost watermarks to increase reclaim pressure to reduce the - * likelihood of future fallbacks. Wake kswapd now as the node - * may be balanced overall and kswapd will not wake naturally. - */ - if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) - set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); - /* We are not allowed to try stealing from the whole block */ if (!whole_block) goto single_page; - free_pages = move_freepages_block(zone, page, start_type, - &movable_pages); + free_pages = move_freepages_block(zone, page, old_block_type, + start_type, &movable_pages); + + /* + * If we fell back into a free block, claim the whole thing + */ + if (old_block_type == MIGRATE_FREE) { + set_pageblock_migratetype(page, start_type); + if (!free_pages) { + /* + * This can leave some non-FREE pages on the + * FREE list. Future fallbacks will get them. + */ + goto single_page; + } + return; + } + /* * Determine how many pages are compatible with our allocation. 
* For movable allocation, it's the number of movable pages which @@ -2825,7 +2831,8 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, return; single_page: - move_to_free_list(page, zone, current_order, start_type); + move_to_free_list(page, zone, current_order, + old_block_type, start_type); } /* @@ -2852,7 +2859,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (free_area_empty(area, fallback_mt)) continue; - if (can_steal_fallback(order, migratetype)) + if (can_steal_fallback(order, migratetype, fallback_mt)) *can_steal = true; if (!only_stealable) @@ -2865,124 +2872,6 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; } -/* - * Reserve a pageblock for exclusive use of high-order atomic allocations if - * there are no empty page blocks that contain a page with a suitable order - */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, - unsigned int alloc_order) -{ - int mt; - unsigned long max_managed, flags; - - /* - * Limit the number reserved to 1 pageblock or roughly 1% of a zone. - * Check is race-prone but harmless. - */ - max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; - if (zone->nr_reserved_highatomic >= max_managed) - return; - - spin_lock_irqsave(&zone->lock, flags); - - /* Recheck the nr_reserved_highatomic limit under the lock */ - if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; - - /* Yoink! */ - mt = get_pageblock_migratetype(page); - /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) { - zone->nr_reserved_highatomic += pageblock_nr_pages; - set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); - } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); -} - -/* - * Used when an allocation is about to fail under memory pressure. This - * potentially hurts the reliability of high-order allocations when under - * intense memory pressure but failed atomic allocations should be easier - * to recover from than an OOM. - * - * If @force is true, try to unreserve a pageblock even though highatomic - * pageblock is exhausted. - */ -static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, - bool force) -{ - struct zonelist *zonelist = ac->zonelist; - unsigned long flags; - struct zoneref *z; - struct zone *zone; - struct page *page; - int order; - bool ret; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, - ac->nodemask) { - /* - * Preserve at least one pageblock unless memory pressure - * is really high. - */ - if (!force && zone->nr_reserved_highatomic <= - pageblock_nr_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - struct free_area *area = &(zone->free_area[order]); - - page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); - if (!page) - continue; - - /* - * In page freeing path, migratetype change is racy so - * we can counter several free pages in a pageblock - * in this loop although we changed the pageblock type - * from highatomic to ac->migratetype. So we should - * adjust the count once. - */ - if (is_migrate_highatomic_page(page)) { - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. 
- */ - zone->nr_reserved_highatomic -= min( - pageblock_nr_pages, - zone->nr_reserved_highatomic); - } - - /* - * Convert to ac->migratetype and avoid the normal - * pageblock stealing heuristics. Minimally, the caller - * is doing the work and needs the pages. More - * importantly, if the block was always converted to - * MIGRATE_UNMOVABLE or another type then the number - * of pageblocks that cannot be completely freed - * may increase. - */ - set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype, - NULL); - if (ret) { - spin_unlock_irqrestore(&zone->lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } - - return false; -} - /* * Try finding a free buddy page on the fallback list and put it on the free * list of requested migratetype, possibly along with other pages from the same @@ -3142,18 +3031,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ list_add_tail(&page->pcp_list, list); allocated++; - if (is_migrate_cma(get_pcppage_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); } - - /* - * i pages were removed from the buddy list even if some leak due - * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'allocated' which is the number of - * pages added to the pcp list. - */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); return allocated; } @@ -3466,18 +3344,11 @@ void free_unref_page(struct page *page, unsigned int order) /* * We only track unmovable, reclaimable and movable on pcp lists. - * Place ISOLATE pages on the isolated list because they are being - * offlined but treat HIGHATOMIC as movable pages so we can get those - * areas back if necessary. Otherwise, we may have to free - * excessively into the page allocator */ migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { - if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); - return; - } - migratetype = MIGRATE_MOVABLE; + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + return; } zone = page_zone(page); @@ -3517,7 +3388,7 @@ void free_unref_page_list(struct list_head *list) * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(page); - if (unlikely(is_migrate_isolate(migratetype))) { + if (unlikely(is_migrate_isolate(migratetype) || migratetype == MIGRATE_FREE)) { list_del(&page->lru); free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); continue; @@ -3605,27 +3476,25 @@ int __isolate_free_page(struct page *page, unsigned int order) int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { + long free_pages = zone_page_state(zone, NR_FREE_PAGES); unsigned long watermark; /* - * Obey watermarks as if the page was being allocated. We can - * emulate a high-order watermark check with a raised order-0 - * watermark, because we already know our high-order page - * exists. + * Keep a lid on concurrent compaction. MIGRATE_FREE + * watermarks alone cannot be checked here, because + * that's what the caller is trying to produce. 
*/ watermark = zone->_watermark[WMARK_MIN] + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!__zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA, free_pages)) return 0; - - __mod_zone_freepage_state(zone, -(1UL << order), mt); } - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, mt); /* - * Set the pageblock if the isolated page is at least half of a - * pageblock + * Set the pageblock if the isolated page is from a free block + * or at least half of a pageblock */ - if (order >= pageblock_order - 1) { + if (mt == MIGRATE_FREE || order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); @@ -3698,26 +3567,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, unsigned long flags; do { - page = NULL; spin_lock_irqsave(&zone->lock, flags); - /* - * order-0 request can reach here when the pcplist is skipped - * due to non-CMA allocation context. HIGHATOMIC area is - * reserved for high-order atomic allocation, so order-0 - * request should skip it. - */ - if (order > 0 && alloc_flags & ALLOC_HARDER) - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); - if (!page) { - spin_unlock_irqrestore(&zone->lock, flags); - return NULL; - } - } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); + page = __rmqueue(zone, order, migratetype, alloc_flags); spin_unlock_irqrestore(&zone->lock, flags); + if (!page) + return NULL; } while (check_new_pages(page, order)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -3821,6 +3675,7 @@ struct page *rmqueue(struct zone *preferred_zone, int migratetype) { struct page *page; + int buddy = 0; /* * We most definitely don't want callers attempting to @@ -3844,15 +3699,14 @@ struct page *rmqueue(struct zone *preferred_zone, page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, migratetype); + buddy = 1; out: - /* Separate test+clear to avoid unnecessary atomics */ - if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { - clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); - wakeup_kswapd(zone, 0, 0, zone_idx(zone)); - } - VM_BUG_ON_PAGE(page && bad_range(zone, page), page); + VM_WARN_ONCE(page && get_pageblock_migratetype(page) != migratetype, + "%d:%s order=%u gfp=%pGg mt=%s alloc_flags=%x buddy=%d\n", + zone_to_nid(zone), zone->name, order, &gfp_flags, + migratetype_names[migratetype], alloc_flags, buddy); return page; } @@ -3936,27 +3790,57 @@ noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) } ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); -static inline long __zone_watermark_unusable_free(struct zone *z, - unsigned int order, unsigned int alloc_flags) +static long page_state(struct zone *zone, enum zone_stat_item item, bool safe) { - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); - long unusable_free = (1 << order) - 1; + if (safe) + return zone_page_state_snapshot(zone, item); + else + return zone_page_state(zone, item); +} + +static long __zone_free_pages(struct zone *zone, int alloc_flags, bool safe) +{ + long free_pages; /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. 
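With the highatomic reserve gone, __isolate_free_page() above still has to keep compaction from pulling the zone below its minimum watermark. The gate is plain arithmetic: pretend the 2^order page is already gone by raising the order-0 watermark by that many pages. A small standalone sketch, ignoring lowmem_reserve and the per-migratetype details:

    /* Sketch of the watermark gate used by __isolate_free_page() above. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool may_isolate_free_page(long free_pages, long min_wmark,
                                      unsigned int order)
    {
        /* Raise the order-0 watermark by the pages we are about to take. */
        long watermark = min_wmark + (1L << order);

        return free_pages > watermark;
    }

    int main(void)
    {
        /* e.g. 10240 free pages, min watermark 8192, isolating order-9 (2MB) */
        printf("%d\n", may_isolate_free_page(10240, 8192, 9)); /* 1: ok */
        printf("%d\n", may_isolate_free_page(8300, 8192, 9));  /* 0: too tight */
        return 0;
    }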
+ * Enforce all watermarks against MIGRATE_FREE pages. + * + * It'd be possible to check against MIGRATE_FREE plus the + * free pages inside blocks that are compatible with the + * allocation. However, this would complicate reclaim, and + * care would have to be taken such that type fallbacks don't + * result in one allocation type eating the physical reserves + * of another while complying with its own watermarks. + * + * Since watermarks are a very small percentage of overall + * memory, and reclaim wouldn't be far away anyway once the + * neutral blocks have been consumed, let's keep it simple. */ - if (likely(!alloc_harder)) - unusable_free += z->nr_reserved_highatomic; + free_pages = page_state(zone, NR_FREE_FREE, safe); + if (IS_ENABLED(CONFIG_CMA) && (alloc_flags & ALLOC_CMA)) + free_pages += page_state(zone, NR_FREE_CMA_PAGES, safe); -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); -#endif + if (!IS_ENABLED(CONFIG_COMPACTION)) { + /* + * We can't reasonably defragment without compaction. + * Consider everything and do best-effort grouping. + */ + free_pages += page_state(zone, NR_FREE_UNMOVABLE, safe); + free_pages += page_state(zone, NR_FREE_MOVABLE, safe); + free_pages += page_state(zone, NR_FREE_RECLAIMABLE, safe); + } - return unusable_free; + return free_pages; +} + +static long zone_free_pages(struct zone *zone, int alloc_flags) +{ + return __zone_free_pages(zone, alloc_flags, false); +} + +static long zone_free_pages_safe(struct zone *zone, int alloc_flags) +{ + return __zone_free_pages(zone, alloc_flags, true); } /* @@ -3974,7 +3858,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ - free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); + free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; @@ -4012,6 +3896,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, if (!area->nr_free) continue; + if (!free_area_empty(area, MIGRATE_FREE)) + return true; + for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!free_area_empty(area, mt)) return true; @@ -4023,8 +3910,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) - return true; } return false; } @@ -4033,49 +3918,26 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, - zone_page_state(z, NR_FREE_PAGES)); + zone_free_pages(z, alloc_flags)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, gfp_t gfp_mask) { - long free_pages; - - free_pages = zone_page_state(z, NR_FREE_PAGES); + long free_pages = zone_free_pages(z, alloc_flags); /* * Fast check for order-0 only. If this fails then the reserves * need to be calculated. */ - if (!order) { - long usable_free; - long reserved; - - usable_free = free_pages; - reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); - - /* reserved may over estimate high-atomic reserves. 
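The helper above decides which free counters may satisfy a watermark check: only the neutral MIGRATE_FREE pool by default, CMA pages when the allocation is allowed to use them, and every per-type pool when compaction is unavailable and blocks can never be made neutral again. A rough userspace model of that selection, with illustrative field names rather than the kernel's zone statistics:

    /* Rough model of __zone_free_pages() above; field names illustrative. */
    #include <stdbool.h>
    #include <stdio.h>

    struct zone_counters {
        long free_free;        /* NR_FREE_FREE        */
        long free_cma;         /* NR_FREE_CMA_PAGES   */
        long free_unmovable;   /* NR_FREE_UNMOVABLE   */
        long free_movable;     /* NR_FREE_MOVABLE     */
        long free_reclaimable; /* NR_FREE_RECLAIMABLE */
    };

    static long watermark_free_pages(const struct zone_counters *z,
                                     bool alloc_cma, bool have_compaction)
    {
        long free = z->free_free;

        if (alloc_cma)
            free += z->free_cma;
        if (!have_compaction)
            free += z->free_unmovable + z->free_movable + z->free_reclaimable;
        return free;
    }

    int main(void)
    {
        struct zone_counters z = { 4096, 1024, 512, 8192, 256 };

        printf("movable+CMA, compaction on: %ld\n",
               watermark_free_pages(&z, true, true));   /* 5120 */
        printf("kernel alloc, no compaction: %ld\n",
               watermark_free_pages(&z, false, false)); /* 13056 */
        return 0;
    }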
*/ - usable_free -= min(usable_free, reserved); - if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) - return true; - } + if (!order && (free_pages - ((1 << order) - 1) > + mark + z->lowmem_reserve[highest_zoneidx])) + return true; if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; - /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations - * when checking the min watermark. The min watermark is the - * point where boosting is ignored so that kswapd is woken up - * when below the low watermark. - */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost - && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { - mark = z->_watermark[WMARK_MIN]; - return __zone_watermark_ok(z, order, mark, highest_zoneidx, - alloc_flags, free_pages); - } return false; } @@ -4083,13 +3945,8 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx) { - long free_pages = zone_page_state(z, NR_FREE_PAGES); - - if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) - free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, - free_pages); + zone_free_pages_safe(z, 0)); } #ifdef CONFIG_NUMA @@ -4281,14 +4138,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); - - /* - * If this is a high-order atomic allocation then check - * if the pageblock should be reserved for the future - */ - if (unlikely(order && (alloc_flags & ALLOC_HARDER))) - reserve_highatomic_pageblock(page, zone, order); - return page; } else { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -4480,22 +4329,44 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned long pflags; unsigned int noreclaim_flag; + struct capture_control capc = { + .order = order, + .migratetype = ac->migratetype, + .page = NULL, + }; + int compact_order; - if (!order) - return NULL; + /* Use reclaim/compaction to produce neutral blocks */ + compact_order = max_t(int, order, pageblock_order); + + /* + * Make sure the structs are really initialized before we expose the + * capture control, in case we are interrupted and the interrupt handler + * frees a page. + */ + barrier(); + WRITE_ONCE(current->capture_control, &capc); psi_memstall_enter(&pflags); delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); - *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - prio, &page); + *compact_result = try_to_compact_pages(gfp_mask, compact_order, + alloc_flags, ac, prio, &capc); memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); delayacct_compact_end(); - if (*compact_result == COMPACT_SKIPPED) + /* + * Make sure we hide capture control first before we read the captured + * page pointer, otherwise an interrupt could free and capture a page + * and we would leak it. 
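The capture_control published above lets pages freed while compaction runs be handed straight to the requester instead of going back on the free list; the pointer is cleared again just below, before the captured page is read, so nothing can be captured after the task stops looking. A single-threaded userspace sketch of that handshake (the simulated interrupt and all names are illustrative; the real ordering guarantees come from barrier()/WRITE_ONCE and are only hinted at in comments):

    /* Illustrative data-flow sketch of the capture handshake above. */
    #include <stdio.h>
    #include <stddef.h>

    struct capture_control {
        unsigned int order;
        void *page;
    };

    static struct capture_control *current_capture;

    /* Stands in for a page being freed from interrupt context. */
    static void fake_interrupt_free(void *page, unsigned int order)
    {
        struct capture_control *capc = current_capture;

        if (capc && !capc->page && capc->order == order) {
            capc->page = page;  /* captured, skip the free list */
            return;
        }
        /* ... otherwise the page would go back to the buddy free list ... */
    }

    int main(void)
    {
        char fake_page[1];
        struct capture_control capc = { .order = 9, .page = NULL };

        /* Publish only after the struct is fully initialized. */
        current_capture = &capc;

        fake_interrupt_free(fake_page, 9);

        /* Unpublish before looking at the result so nothing is captured
         * (and leaked) after we stop checking. */
        current_capture = NULL;

        printf("captured: %s\n", capc.page ? "yes" : "no");
        return 0;
    }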
+ */ + WRITE_ONCE(current->capture_control, NULL); + page = READ_ONCE(capc.page); + + if (!page && *compact_result == COMPACT_SKIPPED) return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -4537,14 +4408,12 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_priority *compact_priority, int *compaction_retries) { - int max_retries = MAX_COMPACT_RETRIES; - int min_priority; bool ret = false; int retries = *compaction_retries; enum compact_priority priority = *compact_priority; - if (!order) - return false; + /* Use reclaim/compaction to produce neutral blocks */ + order = max_t(int, order, pageblock_order); if (fatal_signal_pending(current)) return false; @@ -4579,17 +4448,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, goto check_priority; } - /* - * !costly requests are much more important than __GFP_RETRY_MAYFAIL - * costly ones because they are de facto nofail and invoke OOM - * killer to move on while costly can fail and users are ready - * to cope with that. 1/4 retries is rather arbitrary but we - * would need much more detailed feedback from compaction to - * make a better decision. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - max_retries /= 4; - if (*compaction_retries <= max_retries) { + if (*compaction_retries <= MAX_COMPACT_RETRIES) { ret = true; goto out; } @@ -4599,16 +4458,13 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * all retries or failed at the lower priorities. */ check_priority: - min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? - MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; - - if (*compact_priority > min_priority) { + if (*compact_priority > MIN_COMPACT_PRIORITY) { (*compact_priority)--; *compaction_retries = 0; ret = true; } out: - trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + trace_compact_retry(order, priority, compact_result, retries, MAX_COMPACT_RETRIES, ret); return ret; } #else @@ -4767,9 +4623,16 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned long pflags; bool drained = false; + int reclaim_order; + + /* Use reclaim/compaction to produce neutral blocks */ + if (IS_ENABLED(CONFIG_COMPACTION)) + reclaim_order = max_t(int, order, pageblock_order); + else + reclaim_order = order; psi_memstall_enter(&pflags); - *did_some_progress = __perform_reclaim(gfp_mask, order, ac); + *did_some_progress = __perform_reclaim(gfp_mask, reclaim_order, ac); if (unlikely(!(*did_some_progress))) goto out; @@ -4782,7 +4645,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, * Shrink them and try again */ if (!page && !drained) { - unreserve_highatomic_pageblock(ac, false); drain_all_pages(NULL); drained = true; goto retry; @@ -4801,6 +4663,10 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, pg_data_t *last_pgdat = NULL; enum zone_type highest_zoneidx = ac->highest_zoneidx; + /* Use reclaim/compaction to produce neutral blocks */ + if (IS_ENABLED(CONFIG_COMPACTION)) + order = max_t(unsigned int, order, pageblock_order); + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (!managed_zone(zone)) @@ -4929,10 +4795,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Make sure we converge to OOM if we cannot make any progress * several times in the row. 
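A pattern repeated above: direct compaction, compaction retries, direct reclaim and the kswapd wakeups all round the requested order up to a full pageblock, so that every pass produces whole neutral blocks even for small allocations. A tiny sketch of that rounding, assuming pageblock_order is 9 (2MB with 4k pages, as set up later in this patch):

    /* Sketch of the max(order, pageblock_order) rounding used above. */
    #include <stdio.h>

    #define PAGEBLOCK_ORDER 9

    static unsigned int reclaim_order(unsigned int requested_order)
    {
        return requested_order > PAGEBLOCK_ORDER ? requested_order
                                                 : PAGEBLOCK_ORDER;
    }

    int main(void)
    {
        printf("order-0 request  -> work in order-%u units\n", reclaim_order(0));
        printf("order-3 request  -> work in order-%u units\n", reclaim_order(3));
        printf("order-10 request -> work in order-%u units\n", reclaim_order(10));
        return 0;
    }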
*/ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) { - /* Before OOM, exhaust highatomic_reserve */ - return unreserve_highatomic_pageblock(ac, true); - } + if (*no_progress_loops > MAX_RECLAIM_RETRIES) + return false; /* * Keep reclaiming pages while there is a chance this will lead @@ -6044,7 +5908,7 @@ static void show_migration_types(unsigned char type) [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_MOVABLE] = 'M', [MIGRATE_RECLAIMABLE] = 'E', - [MIGRATE_HIGHATOMIC] = 'H', + [MIGRATE_FREE] = 'F', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif @@ -6107,7 +5971,9 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i " mapped:%lu shmem:%lu pagetables:%lu\n" " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" - " free:%lu free_pcp:%lu free_cma:%lu\n", + " free:%lu free_unmovable:%lu free_movable:%lu\n" + " free_reclaimable:%lu free_free:%lu\n" + " free_cma:%lu free_pcp:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), global_node_page_state(NR_ISOLATED_ANON), @@ -6126,8 +5992,12 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), - free_pcp, - global_zone_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_UNMOVABLE), + global_zone_page_state(NR_FREE_MOVABLE), + global_zone_page_state(NR_FREE_RECLAIMABLE), + global_zone_page_state(NR_FREE_FREE), + global_zone_page_state(NR_FREE_CMA_PAGES), + free_pcp); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) @@ -6205,11 +6075,13 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i printk(KERN_CONT "%s" " free:%lukB" - " boost:%lukB" + " free_unmovable:%lukB" + " free_movable:%lukB" + " free_reclaimable:%lukB" + " free_free:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" - " reserved_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -6226,11 +6098,13 @@ void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_i "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->watermark_boost), + K(zone_page_state(zone, NR_FREE_UNMOVABLE)), + K(zone_page_state(zone, NR_FREE_MOVABLE)), + K(zone_page_state(zone, NR_FREE_RECLAIMABLE)), + K(zone_page_state(zone, NR_FREE_FREE)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone->nr_reserved_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), @@ -6989,7 +6863,7 @@ static void __init memmap_init_zone_range(struct zone *zone, return; memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, - zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_FREE); if (*hole_pfn < start_pfn) init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); @@ -7626,7 +7500,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER - 1; + unsigned int order = ilog2(2U << (20 - PAGE_SHIFT)); /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -8760,7 +8634,13 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); - zone->watermark_boost = 
0; + /* + * Ensure the watermark delta is a multiple of the + * neutral block that reclaim/compaction produces. + */ + if (IS_ENABLED(CONFIG_COMPACTION)) + tmp = ALIGN(tmp, 1 << pageblock_order); + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; @@ -9386,8 +9266,7 @@ static int __alloc_contig_pages(unsigned long start_pfn, { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_FREE, gfp_mask); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, @@ -9570,8 +9449,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); + VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); order = buddy_order(page); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); @@ -9622,11 +9502,12 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, current_buddy = page + size; } - if (set_page_guard(zone, current_buddy, high, migratetype)) + if (set_page_guard(zone, current_buddy, high)) continue; if (current_buddy != target) { - add_to_free_list(current_buddy, zone, high, migratetype); + add_to_free_list(current_buddy, zone, high, + migratetype, false); set_buddy_order(current_buddy, high); page = next_page; } @@ -9654,12 +9535,11 @@ bool take_page_off_buddy(struct page *page) int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, page_order); + del_page_from_free_list(page_head, zone, page_order, + migratetype); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 47fbc1696466f0..e119a37ac661ce 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -183,10 +183,8 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, - NULL); - - __mod_zone_freepage_state(zone, -nr_pages, mt); + nr_pages = move_freepages_block(zone, page, mt, + MIGRATE_ISOLATE, NULL); spin_unlock_irqrestore(&zone->lock, flags); return 0; } @@ -251,10 +249,9 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype, NULL); - __mod_zone_freepage_state(zone, nr_pages, migratetype); - } + if (!isolated_page) + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + migratetype, NULL); set_pageblock_migratetype(page, migratetype); if (isolated_page) __putback_isolated_page(page, order, migratetype); @@ -481,8 +478,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, } /** - * start_isolate_page_range() - make page-allocation-type of range of pages to - * be MIGRATE_ISOLATE. 
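The watermark change above keeps the low/high deltas in whole pageblocks, so kswapd's reclaim target is always a round number of the neutral blocks that reclaim/compaction produce. A standalone sketch of the arithmetic with made-up zone sizes (the max-with-min/4 clamp is a simplification of the kernel's scaling):

    /* Sketch of the pageblock-aligned watermark spacing set up above. */
    #include <stdio.h>

    #define PAGEBLOCK_ORDER 9
    #define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

    #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    int main(void)
    {
        unsigned long managed_pages = 4UL << 20;  /* 16GB of 4k pages */
        unsigned long min_wmark = 11309;          /* made-up min watermark */
        unsigned long scale_factor = 10;          /* watermark_scale_factor */
        unsigned long tmp;

        tmp = managed_pages * scale_factor / 10000;
        if (tmp < min_wmark / 4)
            tmp = min_wmark / 4;
        tmp = ALIGN_UP(tmp, PAGEBLOCK_NR_PAGES);

        printf("low  = min + %lu\n", tmp);
        printf("high = low + %lu\n", tmp);
        return 0;
    }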
+ * start_isolate_page_range() - mark page range MIGRATE_ISOLATE * @start_pfn: The lower PFN of the range to be isolated. * @end_pfn: The upper PFN of the range to be isolated. * @migratetype: Migrate type to set in error recovery. @@ -571,8 +567,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, return 0; } -/* - * Make isolated pages available again. +/** + * undo_isolate_page_range - undo effects of start_isolate_page_range() + * @start_pfn: The lower PFN of the isolated range + * @end_pfn: The upper PFN of the isolated range + * @migratetype: New migrate type to set on the range + * + * This finds every MIGRATE_ISOLATE page block in the given range + * and switches it to @migratetype. */ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype) @@ -631,7 +633,21 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } -/* Caller should ensure that requested range is in a single zone */ +/** + * test_pages_isolated - check if pageblocks in range are isolated + * @start_pfn: The first PFN of the isolated range + * @end_pfn: The first PFN *after* the isolated range + * @isol_flags: Testing mode flags + * + * This tests whether all pages in the specified range are free. + * + * If %MEMORY_OFFLINE is specified in @isol_flags, it will consider + * poisoned and offlined pages free as well. + * + * Caller must ensure the requested range doesn't span zones. + * + * Returns 0 if all pages in the range are free, -EBUSY if one or more pages are in use. + */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int isol_flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 5b7b8d4f5297f3..5586be6997cddb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6076,14 +6076,14 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!managed_zone(zone)) continue; - switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_SUCCESS: - case COMPACT_CONTINUE: + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) + return false; + + if (compaction_suitable(zone, sc->order, + sc->reclaim_idx) == COMPACT_CONTINUE) return false; - default: - /* check next zone */ - ; - } } /* @@ -6270,14 +6270,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - enum compact_result suitable; + unsigned long free_pages; - suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); - if (suitable == COMPACT_SUCCESS) - /* Allocation should succeed already. Don't reclaim. */ + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) return true; - if (suitable == COMPACT_SKIPPED) - /* Compaction cannot yet proceed. Do reclaim. */ + + /* Compaction cannot yet proceed, might need reclaim */ + if (compaction_suitable(zone, sc->order, + sc->reclaim_idx) == COMPACT_SKIPPED) return false; /* @@ -6290,8 +6292,10 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * we are already above the high+gap watermark, don't reclaim at all.
*/ watermark = high_wmark_pages(zone) + compact_gap(sc->order); - - return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); + free_pages = zone_page_state_snapshot(zone, NR_FREE_MOVABLE); + free_pages += zone_page_state_snapshot(zone, NR_FREE_CMA_PAGES); + return __zone_watermark_ok(zone, 0, watermark, sc->reclaim_idx, + ALLOC_CMA, free_pages); } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) @@ -6353,6 +6357,21 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->reclaim_idx = gfp_zone(sc->gfp_mask); } + /* Bail if any of the zones are already compactable */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + sc->reclaim_idx, sc->nodemask) { + if (!cpuset_zone_allowed(zone, + GFP_KERNEL | __GFP_HARDWALL)) + continue; + if (compaction_ready(zone, sc)) { + sc->compaction_ready = true; + goto out; + } + } + } + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { /* @@ -6364,22 +6383,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && - sc->order > PAGE_ALLOC_COSTLY_ORDER && - compaction_ready(zone, sc)) { - sc->compaction_ready = true; - continue; - } - /* * Shrink each node in the zonelist once. If the * zonelist is ordered by zone (not the default) then a @@ -6416,7 +6419,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (first_pgdat) consider_reclaim_throttle(first_pgdat, sc); - +out: /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. @@ -6824,30 +6827,6 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) -{ - int i; - struct zone *zone; - - /* - * Check for watermark boosts top-down as the higher zones - * are more likely to be boosted. Both watermarks and boosts - * should not be checked at the same time as reclaim would - * start prematurely when there is no boosting and a lower - * zone is balanced. 
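compaction_ready() above now boils down to three checks: skip reclaim if the allocation can already succeed, reclaim if there is not even enough order-0 memory for compaction to run, and otherwise require movable plus CMA free pages above the high watermark plus the compaction gap. A condensed userspace model, taking compact_gap(order) as 2 << order:

    /* Condensed model of the compaction_ready() decision above. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool compaction_ready(bool alloc_would_succeed, bool enough_to_compact,
                                 long free_movable, long free_cma,
                                 long high_wmark, unsigned int order)
    {
        long gap = 2L << order;     /* compact_gap(order) */

        if (alloc_would_succeed)
            return true;            /* nothing to do */
        if (!enough_to_compact)
            return false;           /* must reclaim first */
        return free_movable + free_cma > high_wmark + gap;
    }

    int main(void)
    {
        /* order-9 request, plenty of movable memory to migrate into */
        printf("%d\n", compaction_ready(false, true, 20000, 0, 16384, 9));
        /* same request, but movable+CMA below high+gap: keep reclaiming */
        printf("%d\n", compaction_ready(false, true, 10000, 0, 16384, 9));
        return 0;
    }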
- */ - for (i = highest_zoneidx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone->watermark_boost) - return true; - } - - return false; -} - /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx @@ -6868,12 +6847,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) if (!managed_zone(zone)) continue; + /* Allocation can succeed in any zone, done */ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = wmark_pages(zone, WMARK_PROMO); else mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; + + /* Allocation can't succeed, but enough order-0 to compact */ + if (compaction_suitable(zone, order, + highest_zoneidx) == COMPACT_CONTINUE) + return true; } /* @@ -6964,16 +6949,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, */ shrink_node(pgdat, sc); - /* - * Fragmentation may mean that the system cannot be rebalanced for - * high-order allocations. If twice the allocation size has been - * reclaimed then recheck watermarks only at order-0 to prevent - * excessive reclaim. Assume that a process requested a high-order - * can direct reclaim/compact. - */ - if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) - sc->order = 0; - return sc->nr_scanned >= sc->nr_to_reclaim; } @@ -7014,28 +6989,25 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) * that are eligible for use by the caller until at least one zone is * balanced. * - * Returns the order kswapd finished reclaiming at. - * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) +static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; - unsigned long nr_boost_reclaim; - unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; - bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, .may_unmap = 1, + .may_swap = 1, + .may_writepage = !laptop_mode, }; set_task_reclaim_state(current, &sc.reclaim_state); @@ -7044,29 +7016,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) count_vm_event(PAGEOUTRUN); - /* - * Account for the reclaim boost. Note that the zone boost is left in - * place so that parallel allocations that are near the watermark will - * stall or direct reclaim until kswapd is finished. - */ - nr_boost_reclaim = 0; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - nr_boost_reclaim += zone->watermark_boost; - zone_boosts[i] = zone->watermark_boost; - } - boosted = nr_boost_reclaim; - -restart: set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - bool balanced; bool ret; sc.reclaim_idx = highest_zoneidx; @@ -7092,40 +7046,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) } } - /* - * If the pgdat is imbalanced then ignore boosting and preserve - * the watermarks for a later time and restart. 
Note that the - * zone watermarks will be still reset at the end of balancing - * on the grounds that the normal reclaim should be enough to - * re-evaluate if boosting is required when kswapd next wakes. - */ - balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); - if (!balanced && nr_boost_reclaim) { - nr_boost_reclaim = 0; - goto restart; - } - - /* - * If boosting is not active then only reclaim if there are no - * eligible zones. Note that sc.reclaim_idx is not used as - * buffer_heads_over_limit may have adjusted it. - */ - if (!nr_boost_reclaim && balanced) + if (pgdat_balanced(pgdat, sc.order, highest_zoneidx)) goto out; - /* Limit the priority of boosting to avoid reclaim writeback */ - if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) - raise_priority = false; - - /* - * Do not writeback or swap pages for boosted reclaim. The - * intent is to relieve pressure not issue sub-optimal IO - * from reclaim context. If no pages are reclaimed, the - * reclaim will be aborted. - */ - sc.may_writepage = !laptop_mode && !nr_boost_reclaim; - sc.may_swap = !nr_boost_reclaim; - /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated @@ -7176,15 +7099,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; - nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); - - /* - * If reclaim made no progress for a boost, stop reclaim as - * IO cannot be queued and it could be an infinite loop in - * extreme circumstances. - */ - if (nr_boost_reclaim && !nr_reclaimed) - break; if (raise_priority || !nr_reclaimed) sc.priority--; @@ -7196,40 +7110,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) out: clear_reclaim_active(pgdat, highest_zoneidx); - /* If reclaim was boosted, account for the reclaim done in this pass */ - if (boosted) { - unsigned long flags; - - for (i = 0; i <= highest_zoneidx; i++) { - if (!zone_boosts[i]) - continue; - - /* Increments are under the zone lock */ - zone = pgdat->node_zones + i; - spin_lock_irqsave(&zone->lock, flags); - zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); - spin_unlock_irqrestore(&zone->lock, flags); - } - - /* - * As there is now likely space, wakeup kcompact to defragment - * pageblocks. - */ - wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); - } - snapshot_refaults(NULL, pgdat); __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); - - /* - * Return the order kswapd stopped reclaiming at as - * prepare_kswapd_sleep() takes it into account. If another caller - * entered the allocator slow path while kswapd was awake, order will - * remain at the higher level. - */ - return sc.order; } /* @@ -7247,7 +7131,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, unsigned int highest_zoneidx) { long remaining = 0; @@ -7265,7 +7149,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. 
*/ - if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -7278,7 +7162,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); remaining = schedule_timeout(HZ/10); @@ -7292,8 +7176,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o kswapd_highest_zoneidx(pgdat, highest_zoneidx)); - if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) - WRITE_ONCE(pgdat->kswapd_order, reclaim_order); + if (READ_ONCE(pgdat->kswapd_order) < order) + WRITE_ONCE(pgdat->kswapd_order, order); } finish_wait(&pgdat->kswapd_wait, &wait); @@ -7304,8 +7188,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -7346,8 +7229,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order; - unsigned int highest_zoneidx = MAX_NR_ZONES - 1; + unsigned int order, highest_zoneidx; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -7370,22 +7252,20 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); - WRITE_ONCE(pgdat->kswapd_order, 0); + order = 0; + highest_zoneidx = MAX_NR_ZONES - 1; + WRITE_ONCE(pgdat->kswapd_order, order); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + atomic_set(&pgdat->nr_writeback_throttled, 0); + for ( ; ; ) { bool ret; - alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - highest_zoneidx = kswapd_highest_zoneidx(pgdat, - highest_zoneidx); - -kswapd_try_sleep: - kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - highest_zoneidx); + kswapd_try_to_sleep(pgdat, order, highest_zoneidx); /* Read the new order and highest_zoneidx */ - alloc_order = READ_ONCE(pgdat->kswapd_order); + order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); @@ -7411,11 +7291,8 @@ static int kswapd(void *p) * request (alloc_order). 
*/ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, - alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, - highest_zoneidx); - if (reclaim_order < alloc_order) - goto kswapd_try_sleep; + order); + balance_pgdat(pgdat, order, highest_zoneidx); } tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); @@ -7456,8 +7333,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, highest_zoneidx) && - !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { + pgdat_balanced(pgdat, order, highest_zoneidx)) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -7465,8 +7341,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * needed. If it fails, it will defer subsequent attempts to * ratelimit its work. */ - if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, highest_zoneidx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 1ea6a5ce1c4161..80ee26588242fa 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1168,6 +1168,10 @@ int fragmentation_index(struct zone *zone, unsigned int order) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ "nr_free_pages", + "nr_free_unmovable", + "nr_free_movable", + "nr_free_reclaimable", + "nr_free_free", "nr_zone_inactive_anon", "nr_zone_active_anon", "nr_zone_inactive_file", @@ -1678,7 +1682,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n pages free %lu" - "\n boost %lu" "\n min %lu" "\n low %lu" "\n high %lu" @@ -1687,7 +1690,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n managed %lu" "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->watermark_boost, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone),