fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs
author: Muhammad Usama Anjum <usama.anjum@collabora.com>
Mon, 21 Aug 2023 14:15:14 +0000 (19:15 +0500)
committer: Andrew Morton <akpm@linux-foundation.org>
Wed, 18 Oct 2023 21:34:12 +0000 (14:34 -0700)
The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
clear the info about page table entries. The following operations are
supported in this IOCTL:
- Scan the address range and get the memory ranges matching the provided
  criteria. This is performed when the output buffer is specified.
- Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect
  the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if
  non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING``
  can be used with or without PM_SCAN_CHECK_WPASYNC.
- Both of those operations can be combined into one atomic operation where
  we can get and write protect the pages as well.

Following flags about pages are currently supported:
- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to from the time it was write protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in the memory
- PAGE_IS_SWAPPED - Page is swapped out
- PAGE_IS_PFNZERO - Page has zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits. The
entire address range passed by user [start, end) is scanned until either
the user provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/proc/task_mmu.c
include/linux/hugetlb.h
include/linux/userfaultfd_k.h
include/uapi/linux/fs.h
mm/hugetlb.c

index 3dd5be96691b4cc234454353ec1b8a324760e0c2..d4ef9a2bf95dbb9f1ef04d0262aa488fff42c985 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -1761,11 +1763,701 @@ static int pagemap_release(struct inode *inode, struct file *file)
        return 0;
 }
 
+/* All PAGE_IS_* categories the PAGEMAP_SCAN ioctl can report or filter on. */
+#define PM_SCAN_CATEGORIES     (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |  \
+                                PAGE_IS_FILE | PAGE_IS_PRESENT |       \
+                                PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |    \
+                                PAGE_IS_HUGE)
+/* All operation flags the ioctl accepts. */
+#define PM_SCAN_FLAGS          (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+/*
+ * Per-call state of one PAGEMAP_SCAN invocation.
+ * @arg: sanitized copy of the userspace pm_scan_arg
+ * @masks_of_interest: union of all requested category masks; lets the walk
+ *     skip computing categories (e.g. PAGE_IS_FILE) nobody asked for
+ * @cur_vma_category: categories that hold for the whole current VMA
+ * @vec_buf: kernel bounce buffer of output ranges
+ * @vec_buf_len: capacity of @vec_buf; @vec_buf_index: slot being filled
+ * @found_pages: matching pages accumulated so far (capped at arg.max_pages)
+ * @vec_out: position in the user output array for the next flush
+ */
+struct pagemap_scan_private {
+       struct pm_scan_arg arg;
+       unsigned long masks_of_interest, cur_vma_category;
+       struct page_region *vec_buf;
+       unsigned long vec_buf_len, vec_buf_index, found_pages;
+       struct page_region __user *vec_out;
+};
+
+/*
+ * Compute the PAGE_IS_* category bits for a single PTE.  PAGE_IS_FILE is
+ * only resolved when the caller's masks mention it, since it requires
+ * looking up the backing page.
+ */
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+                                          struct vm_area_struct *vma,
+                                          unsigned long addr, pte_t pte)
+{
+       unsigned long categories = 0;
+
+       if (pte_present(pte)) {
+               struct page *page;
+
+               categories |= PAGE_IS_PRESENT;
+               /* "Written" == not currently uffd write-protected. */
+               if (!pte_uffd_wp(pte))
+                       categories |= PAGE_IS_WRITTEN;
+
+               if (p->masks_of_interest & PAGE_IS_FILE) {
+                       page = vm_normal_page(vma, addr, pte);
+                       if (page && !PageAnon(page))
+                               categories |= PAGE_IS_FILE;
+               }
+
+               if (is_zero_pfn(pte_pfn(pte)))
+                       categories |= PAGE_IS_PFNZERO;
+       } else if (is_swap_pte(pte)) {
+               swp_entry_t swp;
+
+               categories |= PAGE_IS_SWAPPED;
+               if (!pte_swp_uffd_wp_any(pte))
+                       categories |= PAGE_IS_WRITTEN;
+
+               if (p->masks_of_interest & PAGE_IS_FILE) {
+                       swp = pte_to_swp_entry(pte);
+                       if (is_pfn_swap_entry(swp) &&
+                           !PageAnon(pfn_swap_entry_to_page(swp)))
+                               categories |= PAGE_IS_FILE;
+               }
+       }
+
+       return categories;
+}
+
+/*
+ * Write-protect one PTE for async userfaultfd-wp.  Present PTEs go through
+ * the ptep_modify_prot start/commit protocol, swap PTEs get the swap-side
+ * uffd-wp bit, and empty slots are filled with a PTE_MARKER_UFFD_WP marker
+ * so the protection survives with no backing entry.
+ */
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+                            unsigned long addr, pte_t *pte)
+{
+       pte_t ptent = ptep_get(pte);
+
+       if (pte_present(ptent)) {
+               pte_t old_pte;
+
+               old_pte = ptep_modify_prot_start(vma, addr, pte);
+               ptent = pte_mkuffd_wp(ptent);
+               ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+       } else if (is_swap_pte(ptent)) {
+               ptent = pte_swp_mkuffd_wp(ptent);
+               set_pte_at(vma->vm_mm, addr, pte, ptent);
+       } else {
+               set_pte_at(vma->vm_mm, addr, pte,
+                          make_pte_marker(PTE_MARKER_UFFD_WP));
+       }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * PMD-level counterpart of pagemap_page_category(): compute the PAGE_IS_*
+ * bits for a transparent huge page mapping.  Always includes PAGE_IS_HUGE.
+ */
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+                                         struct vm_area_struct *vma,
+                                         unsigned long addr, pmd_t pmd)
+{
+       unsigned long categories = PAGE_IS_HUGE;
+
+       if (pmd_present(pmd)) {
+               struct page *page;
+
+               categories |= PAGE_IS_PRESENT;
+               if (!pmd_uffd_wp(pmd))
+                       categories |= PAGE_IS_WRITTEN;
+
+               if (p->masks_of_interest & PAGE_IS_FILE) {
+                       page = vm_normal_page_pmd(vma, addr, pmd);
+                       if (page && !PageAnon(page))
+                               categories |= PAGE_IS_FILE;
+               }
+
+               if (is_zero_pfn(pmd_pfn(pmd)))
+                       categories |= PAGE_IS_PFNZERO;
+       } else if (is_swap_pmd(pmd)) {
+               swp_entry_t swp;
+
+               categories |= PAGE_IS_SWAPPED;
+               if (!pmd_swp_uffd_wp(pmd))
+                       categories |= PAGE_IS_WRITTEN;
+
+               if (p->masks_of_interest & PAGE_IS_FILE) {
+                       swp = pmd_to_swp_entry(pmd);
+                       if (is_pfn_swap_entry(swp) &&
+                           !PageAnon(pfn_swap_entry_to_page(swp)))
+                               categories |= PAGE_IS_FILE;
+               }
+       }
+
+       return categories;
+}
+
+/*
+ * Write-protect a whole huge PMD.  Present PMDs are invalidated and
+ * re-installed with the uffd-wp bit; migration entries get the swap-side
+ * bit.  Other non-present PMDs are left untouched.
+ */
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+                            unsigned long addr, pmd_t *pmdp)
+{
+       pmd_t old, pmd = *pmdp;
+
+       if (pmd_present(pmd)) {
+               old = pmdp_invalidate_ad(vma, addr, pmdp);
+               pmd = pmd_mkuffd_wp(old);
+               set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+       } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+               pmd = pmd_swp_mkuffd_wp(pmd);
+               set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+       }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/* Compute the PAGE_IS_* bits for one HugeTLB PTE; always PAGE_IS_HUGE. */
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+       unsigned long categories = PAGE_IS_HUGE;
+
+       /*
+        * According to pagemap_hugetlb_range(), file-backed HugeTLB
+        * page cannot be swapped. So PAGE_IS_FILE is not checked for
+        * swapped pages.
+        */
+       if (pte_present(pte)) {
+               categories |= PAGE_IS_PRESENT;
+               if (!huge_pte_uffd_wp(pte))
+                       categories |= PAGE_IS_WRITTEN;
+               if (!PageAnon(pte_page(pte)))
+                       categories |= PAGE_IS_FILE;
+               if (is_zero_pfn(pte_pfn(pte)))
+                       categories |= PAGE_IS_PFNZERO;
+       } else if (is_swap_pte(pte)) {
+               categories |= PAGE_IS_SWAPPED;
+               if (!pte_swp_uffd_wp_any(pte))
+                       categories |= PAGE_IS_WRITTEN;
+       }
+
+       return categories;
+}
+
+/*
+ * Write-protect one HugeTLB PTE.  Hwpoisoned entries and existing markers
+ * are skipped; migration entries get the swap-side uffd-wp bit; populated
+ * entries get huge_pte_mkuffd_wp(); empty slots get a UFFD_WP marker.
+ */
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+                                 unsigned long addr, pte_t *ptep,
+                                 pte_t ptent)
+{
+       unsigned long psize;
+
+       if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+               return;
+
+       psize = huge_page_size(hstate_vma(vma));
+
+       if (is_hugetlb_entry_migration(ptent))
+               set_huge_pte_at(vma->vm_mm, addr, ptep,
+                               pte_swp_mkuffd_wp(ptent), psize);
+       else if (!huge_pte_none(ptent))
+               huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+                                            huge_pte_mkuffd_wp(ptent));
+       else
+               set_huge_pte_at(vma->vm_mm, addr, ptep,
+                               make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+/*
+ * Undo the most recent pagemap_scan_output() for [addr, end): shrink (or
+ * empty) the current output slot and give back the page count, so the huge
+ * mapping can be reported again after a split / partial-WP bailout.
+ */
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+                                      unsigned long addr, unsigned long end)
+{
+       struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+       if (cur_buf->start != addr)
+               cur_buf->end = addr;
+       else
+               cur_buf->start = cur_buf->end = 0;
+
+       p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+/*
+ * Apply the user's filters to a page's categories: after XOR-ing with
+ * category_inverted, ALL bits of category_mask must match and, when a
+ * category_anyof_mask is given, at least one of its bits must match.
+ */
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+                                            const struct pagemap_scan_private *p)
+{
+       categories ^= p->arg.category_inverted;
+       if ((categories & p->arg.category_mask) != p->arg.category_mask)
+               return false;
+       if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+               return false;
+
+       return true;
+}
+
+/*
+ * VMA-level variant of the filter: only PAGE_IS_WPALLOWED can be decided
+ * per-VMA, so only that bit of category_mask is checked here.
+ */
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+                                           const struct pagemap_scan_private *p)
+{
+       unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+       categories ^= p->arg.category_inverted;
+       if ((categories & required) != required)
+               return false;
+
+       return true;
+}
+
+/*
+ * Per-VMA gate for the page walk.  Returns -EPERM (aborting the whole
+ * scan) when PM_SCAN_CHECK_WPASYNC is set and the VMA has no async
+ * uffd-wp; returns 1 to skip PFNMAP or filtered-out VMAs; 0 to walk it.
+ */
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+                                 struct mm_walk *walk)
+{
+       struct pagemap_scan_private *p = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       unsigned long vma_category = 0;
+
+       if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma))
+               vma_category |= PAGE_IS_WPALLOWED;
+       else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+               return -EPERM;
+
+       if (vma->vm_flags & VM_PFNMAP)
+               return 1;
+
+       if (!pagemap_scan_is_interesting_vma(vma_category, p))
+               return 1;
+
+       p->cur_vma_category = vma_category;
+
+       return 0;
+}
+
+/*
+ * Append [addr, end) with @categories to the bounce buffer, merging with
+ * the current slot when it is contiguous and has the same categories.
+ * Returns false when the buffer has no room for a new slot.
+ */
+static bool pagemap_scan_push_range(unsigned long categories,
+                                   struct pagemap_scan_private *p,
+                                   unsigned long addr, unsigned long end)
+{
+       struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+       /*
+        * When there is no output buffer provided at all, the sentinel values
+        * won't match here. There is no other way for `cur_buf->end` to be
+        * non-zero other than it being non-empty.
+        */
+       if (addr == cur_buf->end && categories == cur_buf->categories) {
+               cur_buf->end = end;
+               return true;
+       }
+
+       if (cur_buf->end) {
+               /* Current slot is occupied: advance, if a free slot remains. */
+               if (p->vec_buf_index >= p->vec_buf_len - 1)
+                       return false;
+
+               cur_buf = &p->vec_buf[++p->vec_buf_index];
+       }
+
+       cur_buf->start = addr;
+       cur_buf->end = end;
+       cur_buf->categories = categories;
+
+       return true;
+}
+
+/*
+ * Record a matching range.  *end may be trimmed so found_pages never
+ * exceeds arg.max_pages.  Returns -ENOSPC (and sets walk_end) when the
+ * page budget or the output buffer is exhausted; the caller detects a
+ * fully rejected range via *end == addr.
+ */
+static int pagemap_scan_output(unsigned long categories,
+                              struct pagemap_scan_private *p,
+                              unsigned long addr, unsigned long *end)
+{
+       unsigned long n_pages, total_pages;
+       int ret = 0;
+
+       if (!p->vec_buf)
+               return 0;
+
+       categories &= p->arg.return_mask;
+
+       n_pages = (*end - addr) / PAGE_SIZE;
+       if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+           total_pages > p->arg.max_pages) {
+               /* Clamp the range to the remaining page budget. */
+               size_t n_too_much = total_pages - p->arg.max_pages;
+               *end -= n_too_much * PAGE_SIZE;
+               n_pages -= n_too_much;
+               ret = -ENOSPC;
+       }
+
+       if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+               *end = addr;
+               n_pages = 0;
+               ret = -ENOSPC;
+       }
+
+       p->found_pages += n_pages;
+       if (ret)
+               p->arg.walk_end = *end;
+
+       return ret;
+}
+
+/*
+ * Handle a huge PMD in one shot.  Returns -ENOENT to tell the caller
+ * "not (or no longer) a THP — fall back to the per-PTE walk"; any other
+ * value is the final result for this range.
+ */
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+                                 unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       struct pagemap_scan_private *p = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       unsigned long categories;
+       spinlock_t *ptl;
+       int ret = 0;
+
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (!ptl)
+               return -ENOENT;
+
+       categories = p->cur_vma_category |
+                    pagemap_thp_category(p, vma, start, *pmd);
+
+       if (!pagemap_scan_is_interesting_page(categories, p))
+               goto out_unlock;
+
+       ret = pagemap_scan_output(categories, p, start, &end);
+       if (start == end)
+               goto out_unlock;
+
+       if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+               goto out_unlock;
+       if (~categories & PAGE_IS_WRITTEN)
+               goto out_unlock;
+
+       /*
+        * Break huge page into small pages if the WP operation
+        * needs to be performed on a portion of the huge page.
+        */
+       if (end != start + HPAGE_SIZE) {
+               spin_unlock(ptl);
+               split_huge_pmd(vma, pmd, start);
+               pagemap_scan_backout_range(p, start, end);
+               /* Report as if there was no THP */
+               return -ENOENT;
+       }
+
+       make_uffd_wp_pmd(vma, start, pmd);
+       flush_tlb_range(vma, start, end);
+out_unlock:
+       spin_unlock(ptl);
+       return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+       return -ENOENT;
+#endif
+}
+
+/*
+ * Walk one PMD range: first try the THP fast path, otherwise scan each
+ * PTE, emit matching ranges, and (if requested) write-protect written
+ * pages, batching the TLB flush over the touched sub-range.
+ */
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+                                 unsigned long end, struct mm_walk *walk)
+{
+       struct pagemap_scan_private *p = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       unsigned long addr, flush_end = 0;
+       pte_t *pte, *start_pte;
+       spinlock_t *ptl;
+       int ret;
+
+       arch_enter_lazy_mmu_mode();
+
+       ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+       if (ret != -ENOENT) {
+               arch_leave_lazy_mmu_mode();
+               return ret;
+       }
+
+       ret = 0;
+       start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+       if (!pte) {
+               /* PTE table vanished under us; ask the walker to retry. */
+               arch_leave_lazy_mmu_mode();
+               walk->action = ACTION_AGAIN;
+               return 0;
+       }
+
+       for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+               unsigned long categories = p->cur_vma_category |
+                                          pagemap_page_category(p, vma, addr, ptep_get(pte));
+               unsigned long next = addr + PAGE_SIZE;
+
+               if (!pagemap_scan_is_interesting_page(categories, p))
+                       continue;
+
+               ret = pagemap_scan_output(categories, p, addr, &next);
+               if (next == addr)
+                       break;
+
+               if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+                       continue;
+               if (~categories & PAGE_IS_WRITTEN)
+                       continue;
+
+               make_uffd_wp_pte(vma, addr, pte);
+               if (!flush_end)
+                       start = addr;
+               flush_end = next;
+       }
+
+       if (flush_end)
+               flush_tlb_range(vma, start, addr);
+
+       pte_unmap_unlock(start_pte, ptl);
+       arch_leave_lazy_mmu_mode();
+
+       cond_resched();
+       return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Handle one HugeTLB PTE.  The read-only path avoids taking locks; the
+ * write-protect path holds i_mmap write lock and the huge PTE lock.
+ * Partial-page WP is impossible for HugeTLB, so a trimmed range is backed
+ * out and the walk is stopped at @start instead.
+ */
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+                                     unsigned long start, unsigned long end,
+                                     struct mm_walk *walk)
+{
+       struct pagemap_scan_private *p = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       unsigned long categories;
+       spinlock_t *ptl;
+       int ret = 0;
+       pte_t pte;
+
+       if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+               /* Go the short route when not write-protecting pages. */
+
+               pte = huge_ptep_get(ptep);
+               categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+               if (!pagemap_scan_is_interesting_page(categories, p))
+                       return 0;
+
+               return pagemap_scan_output(categories, p, start, &end);
+       }
+
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+       ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+       pte = huge_ptep_get(ptep);
+       categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+       if (!pagemap_scan_is_interesting_page(categories, p))
+               goto out_unlock;
+
+       ret = pagemap_scan_output(categories, p, start, &end);
+       if (start == end)
+               goto out_unlock;
+
+       if (~categories & PAGE_IS_WRITTEN)
+               goto out_unlock;
+
+       if (end != start + HPAGE_SIZE) {
+               /* Partial HugeTLB page WP isn't possible. */
+               pagemap_scan_backout_range(p, start, end);
+               p->arg.walk_end = start;
+               ret = 0;
+               goto out_unlock;
+       }
+
+       make_uffd_wp_huge_pte(vma, start, ptep, pte);
+       flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+       spin_unlock(ptl);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+       return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+/*
+ * Handle an unmapped hole inside a VMA: report it using only the per-VMA
+ * categories and, if requested, install WP markers via uffd_wp_range().
+ */
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+                                int depth, struct mm_walk *walk)
+{
+       struct pagemap_scan_private *p = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       int ret, err;
+
+       if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+               return 0;
+
+       ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+       if (addr == end)
+               return ret;
+
+       if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+               return ret;
+
+       err = uffd_wp_range(vma, addr, end - addr, true);
+       if (err < 0)
+               ret = err;
+
+       return ret;
+}
+
+/* Callbacks wired into walk_page_range() for the PAGEMAP_SCAN ioctl. */
+static const struct mm_walk_ops pagemap_scan_ops = {
+       .test_walk = pagemap_scan_test_walk,
+       .pmd_entry = pagemap_scan_pmd_entry,
+       .pte_hole = pagemap_scan_pte_hole,
+       .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+/*
+ * Copy and validate the userspace pm_scan_arg: reject unknown flags and
+ * categories, untag and range-check the pointers, page-align the range,
+ * and fill in defaults (max_pages == 0 means unlimited).
+ */
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+                                unsigned long uarg)
+{
+       if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+               return -EFAULT;
+
+       if (arg->size != sizeof(struct pm_scan_arg))
+               return -EINVAL;
+
+       /* Validate requested features */
+       if (arg->flags & ~PM_SCAN_FLAGS)
+               return -EINVAL;
+       if ((arg->category_inverted | arg->category_mask |
+            arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+               return -EINVAL;
+
+       arg->start = untagged_addr((unsigned long)arg->start);
+       arg->end = untagged_addr((unsigned long)arg->end);
+       arg->vec = untagged_addr((unsigned long)arg->vec);
+
+       /* Validate memory pointers */
+       if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+               return -EINVAL;
+       if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+               return -EFAULT;
+       if (!arg->vec && arg->vec_len)
+               return -EINVAL;
+       if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+                             arg->vec_len * sizeof(struct page_region)))
+               return -EFAULT;
+
+       /* Fixup default values */
+       arg->end = ALIGN(arg->end, PAGE_SIZE);
+       arg->walk_end = 0;
+       if (!arg->max_pages)
+               arg->max_pages = ULONG_MAX;
+
+       return 0;
+}
+
+/* Report back to userspace the only output field: walk_end. */
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+                                      unsigned long uargl)
+{
+       struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+       if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * Allocate the kernel-side output bounce buffer, capped at one walk
+ * window's worth of entries.  vec_len == 0 means "no output requested"
+ * and leaves vec_buf NULL (pagemap_scan_output() then does nothing).
+ */
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+       if (!p->arg.vec_len)
+               return 0;
+
+       p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+                              p->arg.vec_len);
+       p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+                                  GFP_KERNEL);
+       if (!p->vec_buf)
+               return -ENOMEM;
+
+       /* Sentinel empty slot; see pagemap_scan_push_range(). */
+       p->vec_buf->start = p->vec_buf->end = 0;
+       p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+       return 0;
+}
+
+/*
+ * Copy the filled bounce-buffer slots to userspace and reset the buffer.
+ * Returns the number of ranges written, 0 if nothing to flush, or
+ * -EFAULT on copy failure.
+ */
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+       const struct page_region *buf = p->vec_buf;
+       long n = p->vec_buf_index;
+
+       if (!p->vec_buf)
+               return 0;
+
+       /* Include the partially-filled current slot, if non-empty. */
+       if (buf[n].end != buf[n].start)
+               n++;
+
+       if (!n)
+               return 0;
+
+       if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+               return -EFAULT;
+
+       p->arg.vec_len -= n;
+       p->vec_out += n;
+
+       p->vec_buf_index = 0;
+       p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+       p->vec_buf->start = p->vec_buf->end = 0;
+
+       return n;
+}
+
+/*
+ * Top-level PAGEMAP_SCAN implementation: validate arguments, then walk
+ * [start, end) in chunks under mmap read lock, flushing output ranges to
+ * userspace between chunks.  Returns the number of ranges written (early
+ * -ENOSPC from a full buffer is converted to that count) or a negative
+ * error; walk_end is always written back.
+ */
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+       struct mmu_notifier_range range;
+       struct pagemap_scan_private p = {0};
+       unsigned long walk_start;
+       size_t n_ranges_out = 0;
+       int ret;
+
+       ret = pagemap_scan_get_args(&p.arg, uarg);
+       if (ret)
+               return ret;
+
+       p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+                             p.arg.return_mask;
+       ret = pagemap_scan_init_bounce_buffer(&p);
+       if (ret)
+               return ret;
+
+       /* Protection change for the range is going to happen. */
+       if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+               mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+                                       mm, p.arg.start, p.arg.end);
+               mmu_notifier_invalidate_range_start(&range);
+       }
+
+       for (walk_start = p.arg.start; walk_start < p.arg.end;
+                       walk_start = p.arg.walk_end) {
+               long n_out;
+
+               if (fatal_signal_pending(current)) {
+                       ret = -EINTR;
+                       break;
+               }
+
+               ret = mmap_read_lock_killable(mm);
+               if (ret)
+                       break;
+               ret = walk_page_range(mm, walk_start, p.arg.end,
+                                     &pagemap_scan_ops, &p);
+               mmap_read_unlock(mm);
+
+               n_out = pagemap_scan_flush_buffer(&p);
+               if (n_out < 0)
+                       ret = n_out;
+               else
+                       n_ranges_out += n_out;
+
+               /* Only -ENOSPC (buffer drained) warrants another chunk. */
+               if (ret != -ENOSPC)
+                       break;
+
+               if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+                       break;
+       }
+
+       /* ENOSPC signifies early stop (buffer full) from the walk. */
+       if (!ret || ret == -ENOSPC)
+               ret = n_ranges_out;
+
+       /* The walk_end isn't set when ret is zero */
+       if (!p.arg.walk_end)
+               p.arg.walk_end = p.arg.end;
+       if (pagemap_scan_writeback_args(&p.arg, uarg))
+               ret = -EFAULT;
+
+       if (p.arg.flags & PM_SCAN_WP_MATCHING)
+               mmu_notifier_invalidate_range_end(&range);
+
+       kfree(p.vec_buf);
+       return ret;
+}
+
+/* ioctl dispatcher for /proc/<pid>/pagemap; only PAGEMAP_SCAN is known. */
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+                          unsigned long arg)
+{
+       struct mm_struct *mm = file->private_data;
+
+       switch (cmd) {
+       case PAGEMAP_SCAN:
+               return do_pagemap_scan(mm, arg);
+
+       default:
+               return -EINVAL;
+       }
+}
+
 const struct file_operations proc_pagemap_operations = {
        .llseek         = mem_lseek, /* borrow this */
        .read           = pagemap_read,
        .open           = pagemap_open,
        .release        = pagemap_release,
+       .unlocked_ioctl = do_pagemap_cmd,
+       .compat_ioctl   = do_pagemap_cmd,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
index e5b9f7e62eeb7fc40874e89d2ab7026c7a5a96b0..205469aa061348c69d0b74c9067307bf446e39b0 100644 (file)
@@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long cp_flags);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
index c98df391bfd8f5437c9a435bc93a301f31209205..f2dc19f40d0596c74d877616c16dd9c242eac2df 100644 (file)
@@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
        return VM_FAULT_SIGBUS;
 }
 
+/*
+ * Stub for !CONFIG_USERFAULTFD builds: write-protection is never
+ * performed, so report zero pages changed.
+ * NOTE(review): `return false` on a long-returning function evaluates to
+ * 0 but reads oddly — consider `return 0;`.
+ */
+static inline long uffd_wp_range(struct vm_area_struct *vma,
+                                unsigned long start, unsigned long len,
+                                bool enable_wp)
+{
+       return false;
+}
+
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
                                        struct vm_userfaultfd_ctx vm_ctx)
 {
index b7b56871029c58148cace2e383249cd193e29151..da43810b74856b5c9d34d57ecd10058732801921 100644 (file)
@@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED  (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
                         RWF_APPEND)
 
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN   _IOWR('f', 16, struct pm_scan_arg)
+
+/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
+#define PAGE_IS_WPALLOWED      (1 << 0)
+#define PAGE_IS_WRITTEN                (1 << 1)
+#define PAGE_IS_FILE           (1 << 2)
+#define PAGE_IS_PRESENT                (1 << 3)
+#define PAGE_IS_SWAPPED                (1 << 4)
+#define PAGE_IS_PFNZERO                (1 << 5)
+#define PAGE_IS_HUGE           (1 << 6)
+
+/*
+ * struct page_region - Page region with flags
+ * @start:     Start of the region
+ * @end:       End of the region (exclusive)
+ * @categories:        PAGE_IS_* category bitmask for the region
+ */
+struct page_region {
+       __u64 start;
+       __u64 end;
+       __u64 categories;
+};
+
+/* Flags for PAGEMAP_SCAN ioctl */
+#define PM_SCAN_WP_MATCHING    (1 << 0)        /* Write protect the pages matched. */
+#define PM_SCAN_CHECK_WPASYNC  (1 << 1)        /* Abort the scan when a non-WP-enabled page is found. */
+
+/*
+ * struct pm_scan_arg - Pagemap ioctl argument
+ * @size:              Size of the structure
+ * @flags:             Flags for the IOCTL
+ * @start:             Starting address of the region
+ * @end:               Ending address of the region
+ * @walk_end:          Address where the scan stopped (written by kernel).
+ *                     walk_end == end (address tags cleared) informs that the scan completed on entire range.
+ * @vec:               Address of page_region struct array for output
+ * @vec_len:           Length of the page_region struct array
+ * @max_pages:         Optional limit for number of returned pages (0 = disabled)
+ * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1
+ * @category_mask:     Skip pages for which any category doesn't match
+ * @category_anyof_mask: Skip pages for which no category matches
+ * @return_mask:       PAGE_IS_* categories that are to be reported in `page_region`s returned
+ */
+struct pm_scan_arg {
+       __u64 size;
+       __u64 flags;
+       __u64 start;
+       __u64 end;
+       __u64 walk_end;
+       __u64 vec;
+       __u64 vec_len;
+       __u64 max_pages;
+       __u64 category_inverted;
+       __u64 category_mask;
+       __u64 category_anyof_mask;
+       __u64 return_mask;
+};
+
 #endif /* _UAPI_LINUX_FS_H */
index bc654b36df9f97ad768d6a661f4b1be3e9876520..2878e0e6bac5c8bcf9a6b1df3efd31f9b677a3e7 100644 (file)
@@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
                return false;
 }
 
-static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
        swp_entry_t swp;
 
@@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                }
 
                entry = huge_pte_clear_uffd_wp(entry);
-               set_huge_pte_at(mm, haddr, ptep, entry);
+               set_huge_pte_at(mm, haddr, ptep, entry,
+                               huge_page_size(hstate_vma(vma)));
                /* Fallthrough to CoW */
        }