mm/pgtable: allow pte_offset_map[_lock]() to fail
authorHugh Dickins <hughd@google.com>
Fri, 9 Jun 2023 01:10:32 +0000 (18:10 -0700)
committerAndrew Morton <akpm@linux-foundation.org>
Mon, 19 Jun 2023 23:19:12 +0000 (16:19 -0700)
Make pte_offset_map() a wrapper for __pte_offset_map() (optionally outputs
pmdval), pte_offset_map_lock() a sparse __cond_lock wrapper for
__pte_offset_map_lock(): those __funcs added in mm/pgtable-generic.c.

__pte_offset_map() do pmdval validation (including pmd_clear_bad() when
pmd_bad()), returning NULL if pmdval is not for a page table.
__pte_offset_map_lock() verify pmdval unchanged after getting the lock,
trying again if it changed.

No #ifdef CONFIG_TRANSPARENT_HUGEPAGE around them: that could be done to
cover the imminent case, but we expect to generalize it later, and it
makes a mess of where to do the pmd_bad() clearing.

Add pte_offset_map_nolock(): outputs ptl like pte_offset_map_lock(),
without actually taking the lock.  This will be preferred to open uses of
pte_lockptr(), because (when split ptlock is in page table's struct page)
it points to the right lock for the returned pte pointer, even if *pmd
gets changed racily afterwards.

Update corresponding Documentation.

Do not add the anticipated rcu_read_lock() and rcu_read_unlock()s yet:
they have to wait until all architectures are balancing pte_offset_map()s
with pte_unmap()s (as in the arch series posted earlier).  But comment
where they will go, so that it's easy to add them for experiments.  And
only when those are in place can transient racy failure cases be enabled.
Add more safety for the PAE mismatched pmd_low pmd_high case at that time.

Link: https://lkml.kernel.org/r/2929bfd-9893-a374-e463-4c3127ff9b9d@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/mm/split_page_table_lock.rst
include/linux/mm.h
include/linux/pgtable.h
mm/pgtable-generic.c

index 50ee0dfc95be6ef3c298d7ddc1544870546f158e..a834fad9de12096769229d43d61118875781e24b 100644 (file)
@@ -14,15 +14,20 @@ tables. Access to higher level tables protected by mm->page_table_lock.
 There are helpers to lock/unlock a table and other accessor functions:
 
  - pte_offset_map_lock()
-       maps pte and takes PTE table lock, returns pointer to the taken
-       lock;
+       maps PTE and takes PTE table lock, returns pointer to PTE with
+       pointer to its PTE table lock, or returns NULL if no PTE table;
+ - pte_offset_map_nolock()
+       maps PTE, returns pointer to PTE with pointer to its PTE table
+       lock (not taken), or returns NULL if no PTE table;
+ - pte_offset_map()
+       maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
+ - pte_unmap()
+       unmaps PTE table;
  - pte_unmap_unlock()
        unlocks and unmaps PTE table;
  - pte_alloc_map_lock()
-       allocates PTE table if needed and take the lock, returns pointer
-       to taken lock or NULL if allocation failed;
- - pte_lockptr()
-       returns pointer to PTE table lock;
+       allocates PTE table if needed and takes its lock, returns pointer to
+       PTE with pointer to its lock, or returns NULL if allocation failed;
  - pmd_lock()
        takes PMD table lock, returns pointer to taken lock;
  - pmd_lockptr()
index 66032f0d515cf9c1a68cfcaee6bb476196bd3944..a08dc8cc48fb50a0378698f777fcf8f8c5d9cb7d 100644 (file)
@@ -2827,14 +2827,25 @@ static inline void pgtable_pte_page_dtor(struct page *page)
        dec_lruvec_page_state(page, NR_PAGETABLE);
 }
 
-#define pte_offset_map_lock(mm, pmd, address, ptlp)    \
-({                                                     \
-       spinlock_t *__ptl = pte_lockptr(mm, pmd);       \
-       pte_t *__pte = pte_offset_map(pmd, address);    \
-       *(ptlp) = __ptl;                                \
-       spin_lock(__ptl);                               \
-       __pte;                                          \
-})
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
+static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
+{
+       return __pte_offset_map(pmd, addr, NULL);
+}
+
+pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, spinlock_t **ptlp);
+static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, spinlock_t **ptlp)
+{
+       pte_t *pte;
+
+       __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp));
+       return pte;
+}
+
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+                       unsigned long addr, spinlock_t **ptlp);
 
 #define pte_unmap_unlock(pte, ptl)     do {            \
        spin_unlock(ptl);                               \
index 94235ff2706ec5c18510128b9582c91cbffc0676..3fabbb01855772a0000a93966873a2d48595a43e 100644 (file)
@@ -94,14 +94,22 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 #define pte_offset_kernel pte_offset_kernel
 #endif
 
-#if defined(CONFIG_HIGHPTE)
-#define pte_offset_map(dir, address)                           \
-       ((pte_t *)kmap_local_page(pmd_page(*(dir))) +           \
-        pte_index((address)))
-#define pte_unmap(pte) kunmap_local((pte))
+#ifdef CONFIG_HIGHPTE
+#define __pte_map(pmd, address) \
+       ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
+#define pte_unmap(pte) do {    \
+       kunmap_local((pte));    \
+       /* rcu_read_unlock() to be added later */       \
+} while (0)
 #else
-#define pte_offset_map(dir, address)   pte_offset_kernel((dir), (address))
-#define pte_unmap(pte) ((void)(pte))   /* NOP */
+static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
+{
+       return pte_offset_kernel(pmd, address);
+}
+static inline void pte_unmap(pte_t *pte)
+{
+       /* rcu_read_unlock() to be added later */
+}
 #endif
 
 /* Find an entry in the second-level page table.. */
index d2fc52bffafccda595d9a112711cff806e1f8d02..c7ab18a5fb77cffe9b924e932e90541ae6138bd7 100644 (file)
@@ -10,6 +10,8 @@
 #include <linux/pagemap.h>
 #include <linux/hugetlb.h>
 #include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/mm_inline.h>
 #include <asm/tlb.h>
 
@@ -229,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 }
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+{
+       pmd_t pmdval;
+
+       /* rcu_read_lock() to be added later */
+       pmdval = pmdp_get_lockless(pmd);
+       if (pmdvalp)
+               *pmdvalp = pmdval;
+       if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+               goto nomap;
+       if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+               goto nomap;
+       if (unlikely(pmd_bad(pmdval))) {
+               pmd_clear_bad(pmd);
+               goto nomap;
+       }
+       return __pte_map(&pmdval, addr);
+nomap:
+       /* rcu_read_unlock() to be added later */
+       return NULL;
+}
+
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+                            unsigned long addr, spinlock_t **ptlp)
+{
+       pmd_t pmdval;
+       pte_t *pte;
+
+       pte = __pte_offset_map(pmd, addr, &pmdval);
+       if (likely(pte))
+               *ptlp = pte_lockptr(mm, &pmdval);
+       return pte;
+}
+
+pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+                            unsigned long addr, spinlock_t **ptlp)
+{
+       spinlock_t *ptl;
+       pmd_t pmdval;
+       pte_t *pte;
+again:
+       pte = __pte_offset_map(pmd, addr, &pmdval);
+       if (unlikely(!pte))
+               return pte;
+       ptl = pte_lockptr(mm, &pmdval);
+       spin_lock(ptl);
+       if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+               *ptlp = ptl;
+               return pte;
+       }
+       pte_unmap_unlock(pte, ptl);
+       goto again;
+}