// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)     "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:          called for each lowest-level entry (PTE).
 * @nr_walked:          the number of PTEs walked so far.
 * @reuse_page:         the page which is reused for the tail vmemmap pages.
 * @reuse_addr:         the virtual address of the @reuse_page page.
 * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
 *                      or that the range is remapped from.
 * @flags:              used to modify behavior in vmemmap page table walking
 *                      operations.
 */
struct vmemmap_remap_walk {
        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
                                             struct vmemmap_remap_walk *walk);
        unsigned long           nr_walked;
        struct page             *reuse_page;
        unsigned long           reuse_addr;
        struct list_head        *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH      BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH      BIT(1)
        unsigned long           flags;
};

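/*
 * Split a huge (leaf) vmemmap PMD into a PTE table: allocate a page table,
 * populate it with PTEs covering the same physical range, then install it
 * under init_mm.page_table_lock so concurrent walkers see a consistent view.
 */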
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        pte_t *pgtable;

        pgtable = pte_alloc_one_kernel(&init_mm);
        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(head + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from the buddy allocator must be
                 * able to be treated as independent small pages (as they can
                 * be freed individually).
                 */
                if (!PageReserved(head))
                        split_page(head, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
                        flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

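/*
 * pagewalk pmd_entry callback: decide whether the PMD mapping this chunk of
 * vmemmap needs to be split, and reject ranges whose vmemmap is self-hosted
 * (memmap_on_memory), since those pages cannot be remapped.
 */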
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        int ret = 0;
        struct page *head;
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /* Only splitting, not remapping the vmemmap pages. */
        if (!vmemmap_walk->remap_pte)
                walk->action = ACTION_CONTINUE;

        spin_lock(&init_mm.page_table_lock);
        head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
        /*
         * Due to HugeTLB alignment requirements, and since the vmemmap pages
         * sit at the start of the hotplugged memory region in the
         * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
         * page associated with the first vmemmap page is self-hosted is
         * sufficient.
         *
         * [                  hotplugged memory                  ]
         * [        section        ][...][        section        ]
         * [ vmemmap ][              usable memory               ]
         *   ^  | ^                        |
         *   +--+ |                        |
         *        +------------------------+
         */
        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
                struct page *page = head ? head + pte_index(addr) :
                                    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

                if (PageVmemmapSelfHosted(page))
                        ret = -ENOTSUPP;
        }
        spin_unlock(&init_mm.page_table_lock);
        if (!head || ret)
                return ret;

        return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

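/*
 * pagewalk pte_entry callback: the first PTE visited supplies the reuse page;
 * every subsequent PTE is handed to ->remap_pte().
 */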
static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /*
         * The reuse_page is found 'first' in the page table walk, before the
         * remapping starts.
         */
        if (!vmemmap_walk->reuse_page)
                vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
        else
                vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
        vmemmap_walk->nr_walked++;

        return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
        .pmd_entry      = vmemmap_pmd_entry,
        .pte_entry      = vmemmap_pte_entry,
};

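/*
 * Walk the kernel vmemmap page tables over [start, end) with the callbacks
 * above, then flush the TLB for the remapped range unless the caller asked
 * to defer the flush.
 */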
static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        int ret;

        VM_BUG_ON(!PAGE_ALIGNED(start | end));

        mmap_read_lock(&init_mm);
        ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
                                    NULL, walk);
        mmap_read_unlock(&init_mm);
        if (ret)
                return ret;

        if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
                flush_tlb_kernel_range(start, end);

        return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page))
                free_bootmem_page(page);
        else
                __free_page(page);
}

/* Free a list of vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru)
                free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        struct page *page = pte_page(ptep_get(pte));
        pte_t entry;

        /* Remapping the head page requires r/w */
        if (unlikely(addr == walk->reuse_addr)) {
                pgprot = PAGE_KERNEL;
                list_del(&walk->reuse_page->lru);

                /*
                 * Makes sure that preceding stores to the page contents from
                 * vmemmap_remap_free() become visible before the set_pte_at()
                 * write.
                 */
                smp_wmb();
        }

        entry = mk_pte(walk->reuse_page, pgprot);
        list_add(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(). To avoid triggering the "corrupted
 * mapping in tail page" message, we need to reset at least 3 struct pages:
 * one head struct page and two tail struct pages.
 */
#define NR_RESET_STRUCT_PAGE            3

static inline void reset_struct_pages(struct page *start)
{
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
        memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

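/*
 * Undo vmemmap_remap_pte(): take a fresh page off @walk->vmemmap_pages, copy
 * the reuse page's contents into it, reset the first few struct pages, and
 * point the PTE at the new page with read/write permissions.
 */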
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        /*
         * Makes sure that preceding stores to the page contents become visible
         * before the set_pte_at() write.
         */
        smp_wmb();
        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                      range [@start, @end) into PTE-level mappings
 * @start:     start address of the vmemmap virtual address range that we want
 *             to remap.
 * @end:       end address of the vmemmap virtual address range that we want to
 *             remap.
 * @reuse:     reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
                               unsigned long reuse)
{
        struct vmemmap_remap_walk walk = {
                .remap_pte      = NULL,
                .flags          = VMEMMAP_SPLIT_NO_TLB_FLUSH,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is the
 *              caller's responsibility to free the pages.
 * @flags:      modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                              unsigned long reuse,
                              struct list_head *vmemmap_pages,
                              unsigned long flags)
{
        int ret;
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_remap_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = vmemmap_pages,
                .flags          = flags,
        };
        int nid = page_to_nid((struct page *)reuse);
        gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

        /*
         * Allocate a new head vmemmap page to avoid breaking a contiguous
         * block of struct page memory when freeing it back to the page
         * allocator in free_vmemmap_page_list(). This keeps the likely
         * contiguous struct page backing memory contiguous, allowing for
         * more huge page allocations. Fall back to the currently mapped
         * head page should the allocation fail.
         */
        walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
        if (walk.reuse_page) {
                copy_page(page_to_virt(walk.reuse_page),
                          (void *)walk.reuse_addr);
                list_add(&walk.reuse_page->lru, vmemmap_pages);
        }

        /*
         * To make the remapping routine most efficient for huge pages, the
         * vmemmap page table walk obeys the following rules (see more details
         * in vmemmap_pte_entry()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   should be contiguous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking, which is passed to vmemmap_remap_range().
         * - The @reuse address is the first in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed.  These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte      = vmemmap_restore_pte,
                        .reuse_addr     = reuse,
                        .vmemmap_pages  = vmemmap_pages,
                        .flags          = 0,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }

        return ret;
}

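/*
 * Allocate one page per PAGE_SIZE step of [start, end) on the same node as
 * the struct pages being restored; on failure, release whatever has been
 * accumulated on @list and return -ENOMEM.
 */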
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   struct list_head *list)
{
        gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;

        while (nr_pages--) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add(&page->lru, list);
        }

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_page(page);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       back to freshly allocated vmemmap pages, one page per
 *                       PTE.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 * @flags:      modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                               unsigned long reuse, unsigned long flags)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_restore_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
                .flags          = flags,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
                return -ENOMEM;

        return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

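/*
 * Reallocate and remap the vmemmap pages for one previously optimized folio.
 * On success the optimized flag is cleared and the static key is dropped.
 */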
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
                                           struct folio *folio, unsigned long flags)
{
        int ret;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        if (!folio_test_hugetlb_vmemmap_optimized(folio))
                return 0;

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) is mapped to have been freed to the buddy allocator,
         * and the range has been remapped to the page which @vmemmap_reuse is
         * mapped to. When a HugeTLB page is freed to the buddy allocator, the
         * previously discarded vmemmap pages must be allocated and remapped.
         */
        ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
        if (!ret) {
                folio_clear_hugetlb_vmemmap_optimized(folio);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *                              hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *                              will be reallocated and remapped.
 * @h:          struct hstate.
 * @folio:      the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
        return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:                  hstate.
 * @folio_list:         list of folios.
 * @non_hvo_folios:     Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *              if an error was encountered restoring vmemmap for a folio.
 *              Folios that have vmemmap are moved to the non_hvo_folios
 *              list.  Processing of entries stops when the first error is
 *              encountered. The folio that experienced the error and all
 *              non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                                        struct list_head *folio_list,
                                        struct list_head *non_hvo_folios)
{
        struct folio *folio, *t_folio;
        long restored = 0;
        long ret = 0;

        list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
                if (folio_test_hugetlb_vmemmap_optimized(folio)) {
                        ret = __hugetlb_vmemmap_restore_folio(h, folio,
                                                              VMEMMAP_REMAP_NO_TLB_FLUSH);
                        if (ret)
                                break;
                        restored++;
                }

                /* Add non-optimized folios to output list */
                list_move(&folio->lru, non_hvo_folios);
        }

        if (restored)
                flush_tlb_all();
        if (!ret)
                ret = restored;
        return ret;
}

/* Return true iff a HugeTLB folio's vmemmap both should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
        if (folio_test_hugetlb_vmemmap_optimized(folio))
                return false;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(h))
                return false;

        return true;
}

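/*
 * Remap @folio's tail vmemmap pages to the page backing its head vmemmap
 * page and collect the now-unused pages on @vmemmap_pages. For example, on
 * x86_64 with 4KiB base pages (and assuming a 64-byte struct page), a 2MiB
 * HugeTLB page is described by 512 struct pages spanning 8 vmemmap pages:
 * the first vmemmap page is kept and the remaining 7 are remapped to it and
 * can be freed.
 */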
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
                                            struct folio *folio,
                                            struct list_head *vmemmap_pages,
                                            unsigned long flags)
{
        int ret = 0;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        if (!vmemmap_should_optimize_folio(h, folio))
                return ret;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);
        /*
         * Very Subtle
         * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
         * immediately after remapping.  As a result, subsequent accesses
         * and modifications to struct pages associated with the hugetlb
         * page could be to the OLD struct pages.  Set the vmemmap optimized
         * flag here so that it is copied to the new head page.  This keeps
         * the old and new struct pages in sync.
         * If there is an error during optimization, we will immediately FLUSH
         * the TLB and clear the flag below.
         */
        folio_set_hugetlb_vmemmap_optimized(folio);

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to.  Add pages previously
         * mapping the range to the vmemmap_pages list so that they can be freed
         * by the caller.
         */
        ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
                                 vmemmap_pages, flags);
        if (ret) {
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
                folio_clear_hugetlb_vmemmap_optimized(folio);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:          struct hstate.
 * @folio:      the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
        LIST_HEAD(vmemmap_pages);

        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
        free_vmemmap_page_list(&vmemmap_pages);
}

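/*
 * Pre-split the PMDs mapping @folio's vmemmap so that the later remapping
 * pass in hugetlb_vmemmap_optimize_folios() only has to rewrite PTEs.
 */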
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize_folio(h, folio))
                return 0;

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end)
         */
        return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

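/*
 * Optimize every folio on @folio_list with a deferred TLB flush per phase:
 * first split all vmemmap PMDs and flush once, then remap the PTEs for each
 * folio and flush again before freeing the collected vmemmap pages.
 */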
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
        struct folio *folio;
        LIST_HEAD(vmemmap_pages);

        list_for_each_entry(folio, folio_list, lru) {
                int ret = hugetlb_vmemmap_split_folio(h, folio);

                /*
                 * Splitting the PMD requires allocating a page, so fail
                 * early once we encounter the first OOM. There is no point
                 * in retrying, as the split can be done dynamically on remap
                 * with the memory we get back from the vmemmap deduplication.
                 */
                if (ret == -ENOMEM)
                        break;
        }

        flush_tlb_all();

        list_for_each_entry(folio, folio_list, lru) {
                int ret;

                ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
                                                       VMEMMAP_REMAP_NO_TLB_FLUSH);

                /*
                 * Pages to be freed may have been accumulated.  If we
                 * encounter an ENOMEM, free what we have and try again.
                 * This can occur in the case that both splitting fails
                 * halfway and head page allocation also fails. In this
                 * case __hugetlb_vmemmap_optimize_folio() would free memory
                 * allowing more vmemmap remaps to occur.
                 */
                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
                        flush_tlb_all();
                        free_vmemmap_page_list(&vmemmap_pages);
                        INIT_LIST_HEAD(&vmemmap_pages);
                        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
                                                         VMEMMAP_REMAP_NO_TLB_FLUSH);
                }
        }

        flush_tlb_all();
        free_vmemmap_page_list(&vmemmap_pages);
}

673
674 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
675         {
676                 .procname       = "hugetlb_optimize_vmemmap",
677                 .data           = &vmemmap_optimize_enabled,
678                 .maxlen         = sizeof(vmemmap_optimize_enabled),
679                 .mode           = 0644,
680                 .proc_handler   = proc_dobool,
681         },
682         { }
683 };
684
static int __init hugetlb_vmemmap_init(void)
{
        const struct hstate *h;

        /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
        BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

        for_each_hstate(h) {
                if (hugetlb_vmemmap_optimizable(h)) {
                        register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                        break;
                }
        }
        return 0;
}
late_initcall(hugetlb_vmemmap_init);