Skip to content

Commit

Permalink
From patchwork series 420274
Browse files Browse the repository at this point in the history
  • Loading branch information
Fox Snowpatch committed Aug 21, 2024
1 parent ddf9a4c commit 97c5daa
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 31 deletions.
6 changes: 5 additions & 1 deletion Documentation/mm/split_page_table_lock.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@ There are helpers to lock/unlock a table and other accessor functions:
- pte_offset_map_lock()
maps PTE and takes PTE table lock, returns pointer to PTE with
pointer to its PTE table lock, or returns NULL if no PTE table;
- pte_offset_map_nolock()
- pte_offset_map_readonly_nolock()
maps PTE, returns pointer to PTE with pointer to its PTE table
lock (not taken), or returns NULL if no PTE table;
- pte_offset_map_maywrite_nolock()
maps PTE, returns pointer to PTE with pointer to its PTE table
lock (not taken) and the value of its pmd entry, or returns NULL
if no PTE table;
- pte_offset_map()
maps PTE, returns pointer to PTE, or returns NULL if no PTE table;
- pte_unmap()
Expand Down
9 changes: 8 additions & 1 deletion arch/arm/mm/fault-armv.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pmd_t pmdval;
int ret;

pgd = pgd_offset(vma->vm_mm, address);
Expand All @@ -112,16 +113,22 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
if (pmd_none_or_clear_bad(pmd))
return 0;

again:
/*
* This is called while another page table is mapped, so we
* must use the nested version. This also means we need to
* open-code the spin-locking.
*/
pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
pte = pte_offset_map_maywrite_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl);
if (!pte)
return 0;

do_pte_lock(ptl);
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
do_pte_unlock(ptl);
pte_unmap(pte);
goto again;
}

ret = do_adjust_pte(vma, address, pfn, pte);

Expand Down
2 changes: 1 addition & 1 deletion arch/powerpc/mm/pgtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
*/
if (pmd_none(*pmd))
return;
pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
pte = pte_offset_map_readonly_nolock(mm, pmd, addr, &ptl);
BUG_ON(!pte);
assert_spin_locked(ptl);
pte_unmap(pte);
Expand Down
7 changes: 5 additions & 2 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -2986,8 +2986,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
return pte;
}

pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp);
pte_t *pte_offset_map_readonly_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp);
pte_t *pte_offset_map_maywrite_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, pmd_t *pmdvalp,
spinlock_t **ptlp);

#define pte_unmap_unlock(pte, ptl) do { \
spin_unlock(ptl); \
Expand Down
4 changes: 2 additions & 2 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -3231,8 +3231,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return 0;

ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
ptep = pte_offset_map_readonly_nolock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (unlikely(!ptep))
return VM_FAULT_NOPAGE;

Expand Down
39 changes: 36 additions & 3 deletions mm/khugepaged.c
Original file line number Diff line number Diff line change
Expand Up @@ -1009,7 +1009,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
};

if (!pte++) {
pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
/*
* Here the ptl is only used to check pte_same() in
* do_swap_page(), so readonly version is enough.
*/
pte = pte_offset_map_readonly_nolock(mm, pmd, address, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
Expand Down Expand Up @@ -1598,14 +1602,17 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);

start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
start_pte = pte_offset_map_maywrite_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
spin_lock(ptl);
else if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
goto abort;

/* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
Expand Down Expand Up @@ -1651,6 +1658,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 4: remove empty page table */
if (!pml) {
pml = pmd_lock(mm, pmd);
/*
 * We called pte_unmap() and released the ptl before acquiring
 * the pml, which means we left the RCU critical section, so the
 * PTE page may have been freed, so we must do a pmd_same() check
 * before reacquiring the ptl.
*/
if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
spin_unlock(pml);
goto pmd_change;
}
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
}
Expand Down Expand Up @@ -1682,6 +1699,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
pte_unmap_unlock(start_pte, ptl);
if (pml && pml != ptl)
spin_unlock(pml);
pmd_change:
if (notified)
mmu_notifier_invalidate_range_end(&range);
drop_folio:
Expand All @@ -1703,6 +1721,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *pml;
spinlock_t *ptl;
bool skipped_uffd = false;
pte_t *pte;

/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
Expand Down Expand Up @@ -1738,11 +1757,25 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
addr, addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);

pte = pte_offset_map_maywrite_nolock(mm, pmd, addr, &pgt_pmd, &ptl);
if (!pte) {
mmu_notifier_invalidate_range_end(&range);
continue;
}

pml = pmd_lock(mm, pmd);
ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
pte_unmap_unlock(pte, ptl);
if (ptl != pml)
spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
continue;
}
pte_unmap(pte);

/*
* Huge page lock is still held, so normally the page table
* must remain empty; and we have already skipped anon_vma
Expand Down
13 changes: 10 additions & 3 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
pmd_t pmdval;

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (mpc885_ads_defconfig, korg-5.5.0)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc44x_defconfig, korg-5.5.0, /linux/arch/powerpc/configs/ppc44x-qemu.config)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (mpc885_ads, fedora-40, ppc64)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (mpc885_ads_defconfig, fedora-40)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc44x_defconfig, fedora-40, /linux/arch/powerpc/configs/ppc44x-qemu.config)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (corenet64_smp_defconfig, korg-5.5.0)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (pmac32_defconfig, korg-5.5.0, /linux/arch/powerpc/configs/pmac32-qemu.config)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (g5_defconfig, korg-5.5.0, /linux/arch/powerpc/configs/g5-qemu.config)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (corenet32_smp_defconfig, fedora-40)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (pmac32_defconfig, fedora-40, /linux/arch/powerpc/configs/pmac32-qemu.config)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (corenet64_smp_defconfig, fedora-40)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (pmac32, fedora-40, ppc64)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (g5_defconfig, fedora-40, /linux/arch/powerpc/configs/g5-qemu.config)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (corenet64_smp, fedora-40, ppc64)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc64le_guest_defconfig, korg-5.5.0, ppc64le)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc64_defconfig, korg-5.5.0)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc64le, ppc64le, fedora-40)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc64, fedora-40, ppc64)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc64_defconfig, fedora-40)

unused variable 'pmdval' [-Wunused-variable]

Check warning on line 1086 in mm/memory.c

View workflow job for this annotation

GitHub Actions / kernel (ppc64le_guest_defconfig, fedora-40, ppc64le)

unused variable 'pmdval' [-Wunused-variable]
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, max_nr, ret = 0;
Expand All @@ -1108,7 +1109,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ret = -ENOMEM;
goto out;
}
src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
src_pte = pte_offset_map_maywrite_nolock(src_mm, src_pmd, addr, NULL,
&src_ptl);
if (!src_pte) {
pte_unmap_unlock(dst_pte, dst_ptl);
/* ret == 0 */
Expand Down Expand Up @@ -5504,9 +5506,14 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* pmd by anon khugepaged, since that takes mmap_lock in write
* mode; but shmem or file collapse to THP could still morph
* it into a huge pmd: just retry later if so.
*
* Use the maywrite version to indicate that vmf->pte will be
* modified, but since we will use pte_same() to detect the
* change of the pte entry, there is no need to get pmdval.
*/
vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
vmf->pte = pte_offset_map_maywrite_nolock(vmf->vma->vm_mm,
vmf->pmd, vmf->address,
NULL, &vmf->ptl);
if (unlikely(!vmf->pte))
return 0;
vmf->orig_pte = ptep_get_lockless(vmf->pte);
Expand Down
7 changes: 6 additions & 1 deletion mm/mremap.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,12 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
err = -EAGAIN;
goto out;
}
new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
/*
* Use the maywrite version to indicate that new_pte will be modified,
* but since we hold the exclusive mmap_lock, there is no need to
* recheck pmd_same() after acquiring the new_ptl.
*/
new_pte = pte_offset_map_maywrite_nolock(mm, new_pmd, new_addr, NULL, &new_ptl);
if (!new_pte) {
pte_unmap_unlock(old_pte, old_ptl);
err = -EAGAIN;
Expand Down
24 changes: 20 additions & 4 deletions mm/page_vma_mapped.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}

static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
spinlock_t **ptlp)
{
pte_t ptent;
pmd_t pmdval;

if (pvmw->flags & PVMW_SYNC) {
/* Use the stricter lookup */
Expand All @@ -25,17 +27,19 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
return !!pvmw->pte;
}

again:
/*
* It is important to return the ptl corresponding to pte,
* in case *pvmw->pmd changes underneath us; so we need to
* return it even when choosing not to lock, in case caller
* proceeds to loop over next ptes, and finds a match later.
* Though, in most cases, page lock already protects this.
*/
pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
pvmw->address, ptlp);
pvmw->pte = pte_offset_map_maywrite_nolock(pvmw->vma->vm_mm, pvmw->pmd,
pvmw->address, &pmdval, ptlp);
if (!pvmw->pte)
return false;
*pmdvalp = pmdval;

ptent = ptep_get(pvmw->pte);

Expand Down Expand Up @@ -69,6 +73,12 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
}
pvmw->ptl = *ptlp;
spin_lock(pvmw->ptl);

if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pvmw->pmd)))) {
spin_unlock(pvmw->ptl);
goto again;
}

return true;
}

Expand Down Expand Up @@ -278,7 +288,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
step_forward(pvmw, PMD_SIZE);
continue;
}
if (!map_pte(pvmw, &ptl)) {
if (!map_pte(pvmw, &pmde, &ptl)) {
if (!pvmw->pte)
goto restart;
goto next_pte;
Expand Down Expand Up @@ -307,6 +317,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->ptl) {
pvmw->ptl = ptl;
spin_lock(pvmw->ptl);
if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) {
pte_unmap_unlock(pvmw->pte, pvmw->ptl);
pvmw->ptl = NULL;
pvmw->pte = NULL;
goto restart;
}
}
goto this_pte;
} while (pvmw->address < end);
Expand Down
42 changes: 32 additions & 10 deletions mm/pgtable-generic.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,15 +305,30 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
return NULL;
}

pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp)
pte_t *pte_offset_map_readonly_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp)
{
pmd_t pmdval;
pte_t *pte;

pte = __pte_offset_map(pmd, addr, &pmdval);
if (likely(pte))
*ptlp = pte_lockptr(mm, &pmdval);
return pte;
}

pte_t *pte_offset_map_maywrite_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, pmd_t *pmdvalp,
spinlock_t **ptlp)
{
pmd_t pmdval;
pte_t *pte;

pte = __pte_offset_map(pmd, addr, &pmdval);
if (likely(pte))
*ptlp = pte_lockptr(mm, &pmdval);
if (pmdvalp)
*pmdvalp = pmdval;
return pte;
}

Expand Down Expand Up @@ -347,14 +362,21 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
* and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
* afterwards.
*
* pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
* but when successful, it also outputs a pointer to the spinlock in ptlp - as
* pte_offset_map_lock() does, but in this case without locking it. This helps
* the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
* act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
* pointer for the page table that it returns. In principle, the caller should
* recheck *pmd once the lock is taken; in practice, no callsite needs that -
* either the mmap_lock for write, or pte_same() check on contents, is enough.
* pte_offset_map_readonly_nolock(mm, pmd, addr, ptlp), above, is like
* pte_offset_map(); but when successful, it also outputs a pointer to the
* spinlock in ptlp - as pte_offset_map_lock() does, but in this case without
* locking it. This helps the caller to avoid a later pte_lockptr(mm, *pmd),
* which might by that time act on a changed *pmd: pte_offset_map_readonly_nolock()
* provides the correct spinlock pointer for the page table that it returns.
* For readonly case, the caller does not need to recheck *pmd after the lock is
* taken, because the RCU lock will ensure that the PTE page will not be freed.
*
 * pte_offset_map_maywrite_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
 * pte_offset_map_readonly_nolock(); but when successful, it also outputs the
 * pmdval. For cases where pte or pmd entries may be modified, that is, the maywrite
* case, this can help the caller recheck *pmd once the lock is taken. In some
* cases we can pass NULL to pmdvalp: either the mmap_lock for write, or
* pte_same() check on contents, is also enough.
*
* Note that free_pgtables(), used after unmapping detached vmas, or when
* exiting the whole mm, does not take page table lock before freeing a page
Expand Down
12 changes: 10 additions & 2 deletions mm/userfaultfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1143,15 +1143,23 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
src_addr, src_addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
retry:
dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
/*
* Use the maywrite version to indicate that dst_pte will be modified,
* but since we will use pte_same() to detect the change of the pte
* entry, there is no need to get pmdval.
*/
dst_pte = pte_offset_map_maywrite_nolock(mm, dst_pmd, dst_addr, NULL,
&dst_ptl);

/* Retry if a huge pmd materialized from under us */
if (unlikely(!dst_pte)) {
err = -EAGAIN;
goto out;
}

src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
/* same as dst_pte */
src_pte = pte_offset_map_maywrite_nolock(mm, src_pmd, src_addr, NULL,
&src_ptl);

/*
* We held the mmap_lock for reading so MADV_DONTNEED
Expand Down
Loading

0 comments on commit 97c5daa

Please sign in to comment.