mm: add spurious fault fixing support for huge pmd

Page faults may be spurious because of racy access to the page table.
For example, when a non-populated virtual page is accessed on two CPUs
simultaneously, a page fault is triggered on both CPUs.  However, it's
possible that one CPU (say CPU A) cannot find the reason for the page
fault if the other CPU (say CPU B) has changed the page table before
the PTE is checked on CPU A.  Most of the time, spurious page faults
can be ignored safely.  However, if the page fault is for a write
access, it's possible that a stale read-only TLB entry exists on the
local CPU and needs to be flushed on some architectures.  This is
called spurious page fault fixing.

In the current kernel, there is spurious fault fixing support for pte,
but not for huge pmd, because no architecture has needed it.  However,
the next patch in the series changes the write protection fault
handling logic on arm64, so that some stale huge pmd entries may
remain in the TLB.  These entries need to be flushed via the huge pmd
spurious fault fixing mechanism.
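
In code terms, the fix amounts to the pattern handle_pte_fault() already
uses for ptes (factored out as fix_spurious_fault() in the mm/memory.c
hunk below).  A condensed sketch only; the local names (vma, addr, ptep,
entry, write, flags) stand in for the real fault-handling context and the
update_mmu_cache path is omitted:

    /* Condensed sketch of the generic pte-level spurious fault fixing. */
    if (!ptep_set_access_flags(vma, addr, ptep, entry, write)) {
            /* Nothing was updated, so this fault is likely spurious. */
            if (write && !(flags & FAULT_FLAG_TRIED))
                    flush_tlb_fix_spurious_fault(vma, addr, ptep);
    }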

Signed-off-by: Huang Ying <ying.huang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Will Deacon <will@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <yang@os.amperecomputing.com>
Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
commit 79301c7d60 (parent 3a86608788)
Author: Huang Ying, 2025-11-14 16:54:02 +08:00; committed by Catalin Marinas
5 changed files with 73 additions and 30 deletions

include/linux/huge_mm.h

@@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                   struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf);
+bool huge_pmd_set_accessed(struct vm_fault *vmf);
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                   pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                   struct vm_area_struct *vma);

include/linux/pgtable.h

@@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
 #endif
 
+#ifndef flush_tlb_fix_spurious_fault_pmd
+#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0)
+#endif
+
 /*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier. Although no
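
The fallback above makes the new hook a no-op, so nothing changes for
architectures that do not opt in.  As a purely illustrative sketch (not
part of this diff, and not necessarily what the follow-up arm64 patch
does), an architecture that can keep stale huge pmd entries in its TLB
would override the hook in its asm/pgtable.h, for example by reusing its
flush_tlb_page() primitive:

    /* Hypothetical arch override; the real arm64 hook-up is a later patch. */
    #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
            flush_tlb_page(vma, address)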

mm/huge_memory.c

@@ -1641,17 +1641,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
 EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+/**
+ * touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
+ * @vma: The VMA covering @addr
+ * @addr: The virtual address
+ * @pmd: pmd pointer into the page table mapping @addr
+ * @write: Whether it's a write access
+ *
+ * Return: whether the pmd entry is changed
+ */
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, bool write)
 {
-        pmd_t _pmd;
+        pmd_t entry;
 
-        _pmd = pmd_mkyoung(*pmd);
+        entry = pmd_mkyoung(*pmd);
         if (write)
-                _pmd = pmd_mkdirty(_pmd);
+                entry = pmd_mkdirty(entry);
         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                                  pmd, _pmd, write))
+                                  pmd, entry, write)) {
                 update_mmu_cache_pmd(vma, addr, pmd);
+                return true;
+        }
+
+        return false;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1841,18 +1854,14 @@ void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void huge_pmd_set_accessed(struct vm_fault *vmf)
+bool huge_pmd_set_accessed(struct vm_fault *vmf)
 {
         bool write = vmf->flags & FAULT_FLAG_WRITE;
 
-        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
         if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
-                goto unlock;
+                return false;
 
-        touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
-
-unlock:
-        spin_unlock(vmf->ptl);
+        return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
 }
 
 static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
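
The helpers above now report whether they actually updated the pmd entry;
taking the pmd lock and reacting to a "no change" result is left to the
caller.  A condensed view of the intended calling convention, mirroring
the __handle_mm_fault() change in the mm/memory.c hunk further down:

    vmf.ptl = pmd_lock(mm, vmf.pmd);
    /* Returns false if the entry was not changed, e.g. due to a racing update. */
    if (!huge_pmd_set_accessed(&vmf))
            fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
    spin_unlock(vmf.ptl);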

mm/internal.h

@@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
  */
 void touch_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, bool write);
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, bool write);
 
 /*
mm/memory.c

@@ -6115,6 +6115,45 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
         return VM_FAULT_FALLBACK;
 }
 
+/*
+ * Page faults may be spurious because of racy access to the page
+ * table.  For example, when a non-populated virtual page is accessed
+ * on two CPUs simultaneously, a page fault is triggered on both CPUs.
+ * However, it's possible that one CPU (say CPU A) cannot find the
+ * reason for the page fault if the other CPU (say CPU B) has changed
+ * the page table before the PTE is checked on CPU A.  Most of the
+ * time, spurious page faults can be ignored safely.  However, if the
+ * page fault is for a write access, it's possible that a stale
+ * read-only TLB entry exists on the local CPU and needs to be flushed
+ * on some architectures.  This is called spurious page fault fixing.
+ *
+ * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
+ * by default, while flush_tlb_fix_spurious_fault_pmd() is a no-op by
+ * default; most architectures use these defaults.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+                               enum pgtable_level ptlevel)
+{
+        /* Skip spurious TLB flush for retried page fault */
+        if (vmf->flags & FAULT_FLAG_TRIED)
+                return;
+
+        /*
+         * This is needed only for protection faults but the arch code
+         * is not yet telling us if this is a protection fault or not.
+         * This still avoids useless tlb flushes for .text page faults
+         * with threads.
+         */
+        if (vmf->flags & FAULT_FLAG_WRITE) {
+                if (ptlevel == PGTABLE_LEVEL_PTE)
+                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+                                                     vmf->pte);
+                else
+                        flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
+                                                         vmf->pmd);
+        }
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -6196,23 +6235,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
         }
         entry = pte_mkyoung(entry);
         if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
-                                  vmf->flags & FAULT_FLAG_WRITE)) {
+                                  vmf->flags & FAULT_FLAG_WRITE))
                 update_mmu_cache_range(vmf, vmf->vma, vmf->address,
                                        vmf->pte, 1);
-        } else {
-                /* Skip spurious TLB flush for retried page fault */
-                if (vmf->flags & FAULT_FLAG_TRIED)
-                        goto unlock;
-                /*
-                 * This is needed only for protection faults but the arch code
-                 * is not yet telling us if this is a protection fault or not.
-                 * This still avoids useless tlb flushes for .text page faults
-                 * with threads.
-                 */
-                if (vmf->flags & FAULT_FLAG_WRITE)
-                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
-                                                     vmf->pte);
-        }
+        else
+                fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
 unlock:
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         return 0;
@@ -6309,7 +6336,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                         if (!(ret & VM_FAULT_FALLBACK))
                                 return ret;
                 } else {
-                        huge_pmd_set_accessed(&vmf);
+                        vmf.ptl = pmd_lock(mm, vmf.pmd);
+                        if (!huge_pmd_set_accessed(&vmf))
+                                fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+                        spin_unlock(vmf.ptl);
                         return 0;
                 }
         }
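
For reference, fix_spurious_fault() selects the level via enum
pgtable_level.  Only the two constants used above appear in this diff;
a minimal stand-in sufficient for this code would look like the
following sketch (the kernel's actual enum is assumed to carry more
levels than shown here):

    /* Minimal stand-in for the level selector used by fix_spurious_fault(). */
    enum pgtable_level {
            PGTABLE_LEVEL_PTE,
            PGTABLE_LEVEL_PMD,
    };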