mm/rmap: convert make_device_exclusive_range() to make_device_exclusive()

The single "real" user in the tree of make_device_exclusive_range() always
requests making only a single address exclusive.  The current
implementation is hard to fix for properly supporting anonymous THP /
large folios and for avoiding messing with rmap walks in weird ways.

So let's always process a single address/page and return folio + page to
minimize page -> folio lookups.  This is a preparation for further
changes.

Reject any non-anonymous or hugetlb folios early, directly after GUP.

While at it, extend the documentation of make_device_exclusive() to
clarify some things.
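
As an illustration of the new calling convention, here is a minimal caller sketch in the style of the converted nouveau and test_hmm users below; "mm", "addr" and "my_owner" are placeholders for whatever the driver already has at hand, and the device-programming step is elided:

	struct folio *folio;
	struct page *page;

	/* Fault in and convert exactly one page; mmap_lock held across the call. */
	mmap_read_lock(mm);
	page = make_device_exclusive(mm, addr, my_owner, &folio);
	mmap_read_unlock(mm);
	if (IS_ERR(page))
		return PTR_ERR(page);

	/* ... program the device mapping inside the notifier-protected section ... */

	/* Dropping the folio lock + reference lets CPU access revoke exclusivity. */
	folio_unlock(folio);
	folio_put(folio);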

Link: https://lkml.kernel.org/r/20250210193801.781278-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Simona Vetter <simona.vetter@ffwll.ch>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Tested-by: Alistair Popple <apopple@nvidia.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Lyude <lyude@redhat.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yanteng Si <si.yanteng@linux.dev>
Cc: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit 599b684a78
parent bc3fe6805c
Author: David Hildenbrand, 2025-02-10 20:37:45 +01:00; committed by Andrew Morton
7 files changed, 83 insertions(+), 77 deletions(-)

@@ -400,7 +400,7 @@ Exclusive access memory
 Some devices have features such as atomic PTE bits that can be used to implement
 atomic access to system memory. To support atomic operations to a shared virtual
 memory page such a device needs access to that page which is exclusive of any
-userspace access from the CPU. The ``make_device_exclusive_range()`` function
+userspace access from the CPU. The ``make_device_exclusive()`` function
 can be used to make a memory range inaccessible from userspace.
 
 This replaces all mappings for pages in the given range with special swap

@@ -326,7 +326,7 @@ devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``s
 一些设备具有诸如原子PTE位的功能，可以用来实现对系统内存的原子访问。为了支持对一
 个共享的虚拟内存页的原子操作，这样的设备需要对该页的访问是排他的，而不是来自CPU
-的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一
+的任何用户空间访问。 ``make_device_exclusive()`` 函数可以用来使一
 个内存范围不能从用户空间访问。
 
 这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会

@@ -610,10 +610,9 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm,
 		notifier_seq = mmu_interval_read_begin(&notifier->notifier);
 		mmap_read_lock(mm);
-		ret = make_device_exclusive_range(mm, start, start + PAGE_SIZE,
-						  &page, drm->dev);
+		page = make_device_exclusive(mm, start, drm->dev, &folio);
 		mmap_read_unlock(mm);
-		if (ret <= 0 || !page) {
+		if (IS_ERR(page)) {
 			ret = -EINVAL;
 			goto out;
 		}

@@ -46,7 +46,7 @@ struct mmu_interval_notifier;
  * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
  * longer have exclusive access to the page. When sent during creation of an
  * exclusive range the owner will be initialised to the value provided by the
- * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
+ * caller of make_device_exclusive(), otherwise the owner will be NULL.
  */
 enum mmu_notifier_event {
 	MMU_NOTIFY_UNMAP = 0,
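
The @owner cookie is what lets a driver distinguish the MMU_NOTIFY_EXCLUSIVE event raised by its own make_device_exclusive() call from an invalidation that actually revokes device access. A sketch of an interval-notifier callback doing that filtering, modelled on the nouveau callback (my_invalidate and my_device are placeholder names, and a real driver also serializes this with its device-programming lock):

static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	/*
	 * Our own make_device_exclusive() call raises MMU_NOTIFY_EXCLUSIVE
	 * with range->owner set to the cookie we passed in; ignore it so we
	 * do not tear down the mapping we are about to program.
	 */
	if (range->event == MMU_NOTIFY_EXCLUSIVE && range->owner == my_device)
		return true;

	/* Anything else means the device loses (exclusive) access. */
	mmu_interval_set_seq(mni, cur_seq);
	return true;
}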

@@ -663,9 +663,8 @@ int folio_referenced(struct folio *, int is_locked,
 void try_to_migrate(struct folio *folio, enum ttu_flags flags);
 void try_to_unmap(struct folio *, enum ttu_flags flags);
 
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, struct page **pages,
-				void *arg);
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+		void *owner, struct folio **foliop);
 
 /* Avoid racy checks */
 #define PVMW_SYNC		(1 << 0)

@@ -780,10 +780,8 @@ static int dmirror_exclusive(struct dmirror *dmirror,
 	unsigned long start, end, addr;
 	unsigned long size = cmd->npages << PAGE_SHIFT;
 	struct mm_struct *mm = dmirror->notifier.mm;
-	struct page *pages[64];
 	struct dmirror_bounce bounce;
-	unsigned long next;
-	int ret;
+	int ret = 0;
 
 	start = cmd->addr;
 	end = start + size;
@@ -795,36 +793,27 @@ static int dmirror_exclusive(struct dmirror *dmirror,
 		return -EINVAL;
 
 	mmap_read_lock(mm);
-	for (addr = start; addr < end; addr = next) {
-		unsigned long mapped = 0;
-		int i;
-
-		next = min(end, addr + (ARRAY_SIZE(pages) << PAGE_SHIFT));
+	for (addr = start; !ret && addr < end; addr += PAGE_SIZE) {
+		struct folio *folio;
+		struct page *page;
 
-		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
-		/*
-		 * Do dmirror_atomic_map() iff all pages are marked for
-		 * exclusive access to avoid accessing uninitialized
-		 * fields of pages.
-		 */
-		if (ret == (next - addr) >> PAGE_SHIFT)
-			mapped = dmirror_atomic_map(addr, next, pages, dmirror);
-		for (i = 0; i < ret; i++) {
-			if (pages[i]) {
-				unlock_page(pages[i]);
-				put_page(pages[i]);
-			}
+		page = make_device_exclusive(mm, addr, NULL, &folio);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			break;
 		}
 
-		if (addr + (mapped << PAGE_SHIFT) < next) {
-			mmap_read_unlock(mm);
-			mmput(mm);
-			return -EBUSY;
-		}
+		ret = dmirror_atomic_map(addr, addr + PAGE_SIZE, &page, dmirror);
+		ret = ret == 1 ? 0 : -EBUSY;
+		folio_unlock(folio);
+		folio_put(folio);
 	}
 	mmap_read_unlock(mm);
 	mmput(mm);
+	if (ret)
+		return ret;
 
 	/* Return the migrated data for verification. */
 	ret = dmirror_bounce_init(&bounce, start, size);
 	if (ret)

mm/rmap.c

@@ -2495,70 +2495,89 @@ static bool folio_make_device_exclusive(struct folio *folio,
 		.arg = &args,
 	};
 
-	/*
-	 * Restrict to anonymous folios for now to avoid potential writeback
-	 * issues.
-	 */
-	if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
-		return false;
-
 	rmap_walk(folio, &rwc);
 
 	return args.valid && !folio_mapcount(folio);
 }
 
 /**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * make_device_exclusive() - Mark a page for exclusive use by a device
  * @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
+ * @addr: the virtual address to mark for exclusive device access
  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
  *
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
  *
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
  *
  * A driver using this to program access from a device must use a mmu notifier
  * critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
+ * programming is complete it should drop the folio lock and reference after
  * which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ *    pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ *    the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ *    concurrent access through other page references (e.g., earlier GUP
+ *    invocation) is not handled and not supported.
+ * #. device-exclusive entries are considered "clean" and "old" by core-mm.
+ *    Device drivers must update the folio state when informed by MMU
+ *    notifiers.
+ *
+ * Returns: pointer to mapped page on success, otherwise a negative error.
  */
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, struct page **pages,
-				void *owner)
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+		void *owner, struct folio **foliop)
 {
-	long npages = (end - start) >> PAGE_SHIFT;
-	long i;
+	struct folio *folio;
+	struct page *page;
+	long npages;
+
+	mmap_assert_locked(mm);
 
-	npages = get_user_pages_remote(mm, start, npages,
+	/*
+	 * Fault in the page writable and try to lock it; note that if the
+	 * address would already be marked for exclusive use by a device,
+	 * the GUP call would undo that first by triggering a fault.
+	 */
+	npages = get_user_pages_remote(mm, addr, 1,
 				       FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
-				       pages, NULL);
-	if (npages < 0)
-		return npages;
-
-	for (i = 0; i < npages; i++, start += PAGE_SIZE) {
-		struct folio *folio = page_folio(pages[i]);
-		if (PageTail(pages[i]) || !folio_trylock(folio)) {
-			folio_put(folio);
-			pages[i] = NULL;
-			continue;
-		}
+				       &page, NULL);
+	if (npages != 1)
+		return ERR_PTR(npages);
+	folio = page_folio(page);
 
-		if (!folio_make_device_exclusive(folio, mm, start, owner)) {
-			folio_unlock(folio);
-			folio_put(folio);
-			pages[i] = NULL;
-		}
+	if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+		folio_put(folio);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
-	return npages;
+	if (!folio_trylock(folio)) {
+		folio_put(folio);
+		return ERR_PTR(-EBUSY);
+	}
+	if (!folio_make_device_exclusive(folio, mm, addr, owner)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return ERR_PTR(-EBUSY);
+	}
+	*foliop = folio;
+	return page;
 }
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
 #endif
 
 void __put_anon_vma(struct anon_vma *anon_vma)