diff --git a/MAINTAINERS b/MAINTAINERS index bc8631757f3d..c9b7b6f9828e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7303,6 +7303,14 @@ L: linux-cxl@vger.kernel.org S: Supported F: drivers/dax/ +DEVICE DIRECT ACCESS (DAX) [fsdev_dax] +M: John Groves +M: John Groves +L: nvdimm@lists.linux.dev +L: linux-cxl@vger.kernel.org +S: Supported +F: drivers/dax/fsdev.c + DEVICE FREQUENCY (DEVFREQ) M: MyungJoo Ham M: Kyungmin Park diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 504f7f735ef5..602f9a0839a9 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -65,6 +65,11 @@ config DEV_DAX_HMEM_DEVICES depends on DEV_DAX_HMEM && DAX def_bool y +config DEV_DAX_FSDEV + tristate + depends on DEV_DAX && FS_DAX + default DEV_DAX + config DEV_DAX_KMEM tristate "KMEM DAX: map dax-devices as System-RAM" default DEV_DAX diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 70e996bf1526..c0bc6eb1ea20 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -5,9 +5,11 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o +obj-$(CONFIG_DEV_DAX_FSDEV) += fsdev_dax.o dax-y := super.o dax-y += bus.o device_dax-y := device.o dax_pmem-y := pmem.o dax_cxl-y := cxl.o +fsdev_dax-y := fsdev.o diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 68437c05e21d..ccfe65004888 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -40,8 +40,6 @@ static int dax_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "MODALIAS=" DAX_DEVICE_MODALIAS_FMT, 0); } -#define to_dax_drv(__drv) container_of_const(__drv, struct dax_device_driver, drv) - static struct dax_id *__dax_match_id(const struct dax_device_driver *dax_drv, const char *dev_name) { @@ -1431,6 +1429,26 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; +/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ +__weak 
phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, + unsigned long size) +{ + for (int i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + phys_addr_t phys; + + if (!in_range(pgoff, dax_range->pgoff, PHYS_PFN(range_len(range)))) + continue; + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (phys + size - 1 <= range->end) + return phys; + break; + } + return -1; +} +EXPORT_SYMBOL_GPL(dax_pgoff_to_phys); + static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data) { struct dax_region *dax_region = data->dax_region; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 7b1a83f1ce1f..5909171a4428 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -33,6 +33,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); enum dax_driver_type { DAXDRV_KMEM_TYPE, DAXDRV_DEVICE_TYPE, + DAXDRV_FSDEV_TYPE, }; struct dax_device_driver { @@ -43,6 +44,8 @@ struct dax_device_driver { void (*remove)(struct dev_dax *dev); }; +#define to_dax_drv(__drv) container_of_const(__drv, struct dax_device_driver, drv) + int __dax_driver_register(struct dax_device_driver *dax_drv, struct module *module, const char *mod_name); #define dax_driver_register(driver) \ diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index c6ae27c982f4..81e4af49e39c 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -69,6 +69,8 @@ struct dev_dax_range { * data while the device is activated in the driver. 
* @region: parent region * @dax_dev: core dax functionality + * @virt_addr: kva from memremap; used by fsdev_dax + * @cached_size: size of daxdev cached by fsdev_dax * @align: alignment of this instance * @target_node: effective numa node if dev_dax memory range is onlined * @dyn_id: is this a dynamic or statically created instance @@ -83,6 +85,8 @@ struct dev_dax_range { struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; + void *virt_addr; + u64 cached_size; unsigned int align; int target_node; bool dyn_id; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 381021c2e031..d0c9b4e03b47 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -57,29 +57,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, vma->vm_file, func); } -/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ -__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, - unsigned long size) -{ - int i; - - for (i = 0; i < dev_dax->nr_range; i++) { - struct dev_dax_range *dax_range = &dev_dax->ranges[i]; - struct range *range = &dax_range->range; - unsigned long long pgoff_end; - phys_addr_t phys; - - pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; - if (pgoff < dax_range->pgoff || pgoff > pgoff_end) - continue; - phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; - if (phys + size - 1 <= range->end) - return phys; - break; - } - return -1; -} - static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn, unsigned long fault_size) { diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c new file mode 100644 index 000000000000..188b2526bee4 --- /dev/null +++ b/drivers/dax/fsdev.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2026 Micron Technology, Inc. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +/* + * FS-DAX compatible devdax driver + * + * Unlike drivers/dax/device.c which pre-initializes compound folios based + * on device alignment (via vmemmap_shift), this driver leaves folios + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs + * to work without needing special handling for pre-initialized folios. + * + * Key differences from device.c: + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) + * - vmemmap_shift is NOT set (folios remain order-0) + * - fs-dax can dynamically create compound folios as needed + * - No mmap support - all access is through fs-dax/iomap + */ + +static void fsdev_write_dax(void *addr, struct page *page, + unsigned int off, unsigned int len) +{ + while (len) { + void *mem = kmap_local_page(page); + unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off); + + memcpy_flushcache(addr, mem + off, chunk); + kunmap_local(mem); + len -= chunk; + off = 0; + page++; + addr += chunk; + } +} + +static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + unsigned long *pfn) +{ + struct dev_dax *dev_dax = dax_get_private(dax_dev); + size_t size = nr_pages << PAGE_SHIFT; + size_t offset = pgoff << PAGE_SHIFT; + void *virt_addr = dev_dax->virt_addr + offset; + phys_addr_t phys; + unsigned long local_pfn; + + phys = dax_pgoff_to_phys(dev_dax, pgoff, size); + if (phys == -1) { + dev_dbg(&dev_dax->dev, + "pgoff (%#lx) out of range\n", pgoff); + return -EFAULT; + } + + if (kaddr) + *kaddr = virt_addr; + + local_pfn = PHYS_PFN(phys); + if (pfn) + *pfn = local_pfn; + + /* + * Use cached_size which was computed at probe time. The size cannot + * change while the driver is bound (resize returns -EBUSY). 
+ */ + return PHYS_PFN(min(size, dev_dax->cached_size - offset)); +} + +static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + void *kaddr; + long rc; + + WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); + rc = __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + if (rc < 0) + return rc; + fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); + return 0; +} + +static long fsdev_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, unsigned long *pfn) +{ + return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode, + kaddr, pfn); +} + +static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return _copy_from_iter_flushcache(addr, bytes, i); +} + +static const struct dax_operations dev_dax_ops = { + .direct_access = fsdev_dax_direct_access, + .zero_page_range = fsdev_dax_zero_page_range, + .recovery_write = fsdev_dax_recovery_write, +}; + +static void fsdev_cdev_del(void *cdev) +{ + cdev_del(cdev); +} + +static void fsdev_kill(void *dev_dax) +{ + kill_dev_dax(dev_dax); +} + +static void fsdev_clear_ops(void *data) +{ + struct dev_dax *dev_dax = data; + + dax_set_ops(dev_dax->dax_dev, NULL); +} + +/* + * Page map operations for FS-DAX mode + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c + * + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. + * The core mm code in free_zone_device_folio() handles the wake_up_var() + * directly for this memory type. 
+ */
+static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap,
+		unsigned long pfn, unsigned long nr_pages, int mf_flags)
+{
+	struct dev_dax *dev_dax = pgmap->owner;
+	u64 phys = PFN_PHYS(pfn);
+	u64 len = (u64)nr_pages << PAGE_SHIFT;
+
+	/*
+	 * Translate the failing pfn into a byte offset within the dax device.
+	 * This is the inverse of dax_pgoff_to_phys(): every range begins at
+	 * page offset dax_range->pgoff of the device, so subtracting
+	 * ranges[0].range.start is only correct for the first range.
+	 */
+	for (int i = 0; i < dev_dax->nr_range; i++) {
+		struct dev_dax_range *dax_range = &dev_dax->ranges[i];
+		struct range *range = &dax_range->range;
+		u64 offset;
+
+		if (phys < range->start || phys > range->end)
+			continue;
+
+		offset = PFN_PHYS(dax_range->pgoff) + phys - range->start;
+		return dax_holder_notify_failure(dev_dax->dax_dev, offset,
+						 len, mf_flags);
+	}
+
+	/*
+	 * @pfn is not inside any data range of this device.
+	 * NOTE(review): -EFAULT mirrors the out-of-range errno used by
+	 * __fsdev_dax_direct_access(); confirm it is what the caller wants.
+	 */
+	return -EFAULT;
+}
+
+static const struct dev_pagemap_ops fsdev_pagemap_ops = {
+	.memory_failure = fsdev_pagemap_memory_failure,
+};
+
+/*
+ * Clear any stale folio state from pages in the given range.
+ * This is necessary because device_dax pre-initializes compound folios
+ * based on vmemmap_shift, and that state may persist after driver unbind.
+ * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
+ * expects to find clean order-0 folios that it can build into compound
+ * folios on demand.
+ *
+ * At probe time, no filesystem should be mounted yet, so all mappings
+ * are stale and must be cleared along with compound state.
+ */
+static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
+{
+	for (int i = 0; i < dev_dax->nr_range; i++) {
+		struct range *range = &dev_dax->ranges[i].range;
+		unsigned long pfn = PHYS_PFN(range->start);
+		unsigned long end_pfn = PHYS_PFN(range->end) + 1;
+
+		while (pfn < end_pfn) {
+			struct folio *folio = pfn_folio(pfn);
+			int order = dax_folio_reset_order(folio);
+
+			/* Step over every page the folio covered before the reset */
+			pfn += 1UL << order;
+		}
+	}
+}
+
+/* devm action: adapts fsdev_clear_folio_state() to the void * callback type */
+static void fsdev_clear_folio_state_action(void *data)
+{
+	fsdev_clear_folio_state(data);
+}
+
+/* Stash the dev_dax behind this inode's dax_device in the open file */
+static int fsdev_open(struct inode *inode, struct file *filp)
+{
+	struct dax_device *dax_dev = inode_dax(inode);
+	struct dev_dax *dev_dax = dax_get_private(dax_dev);
+
+	filp->private_data = dev_dax;
+
+	return 0;
+}
+
+/* fsdev_open() acquires nothing, so there is nothing to release */
+static int fsdev_release(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+/* Character device ops: open/release only, no read, write or mmap handlers */
+static const struct file_operations fsdev_fops = {
+	.llseek = noop_llseek,
+	.owner = THIS_MODULE,
+	.open = fsdev_open,
+	.release = fsdev_release,
+};
+
+static int
fsdev_dax_probe(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; + struct inode *inode; + u64 data_offset = 0; + struct cdev *cdev; + void *addr; + int rc, i; + + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } + + pgmap = dev_dax->pgmap; + } else { + size_t pgmap_size; + + if (dev_dax->pgmap) { + dev_warn(dev, "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); + pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + pgmap->ranges[i] = *range; + } + } + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + } + + /* Cache size now; it cannot change while driver is bound */ + dev_dax->cached_size = 0; + for (i = 0; i < dev_dax->nr_range; i++) + dev_dax->cached_size += range_len(&dev_dax->ranges[i].range); + + /* + * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving + * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this + * lets fs-dax dynamically build compound folios as needed, similar + * to pmem behavior. + */ + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->ops = &fsdev_pagemap_ops; + pgmap->owner = dev_dax; + + addr = devm_memremap_pages(dev, pgmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + /* + * Clear any stale compound folio state left over from a previous + * driver (e.g., device_dax with vmemmap_shift). 
Also register this + * as a devm action so folio state is cleared on unbind, ensuring + * clean pages for subsequent drivers (e.g., kmem for system-ram). + */ + fsdev_clear_folio_state(dev_dax); + rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, + dev_dax); + if (rc) + return rc; + + /* Detect whether the data is at a non-zero offset into the memory */ + if (pgmap->range.start != dev_dax->ranges[0].range.start) { + u64 phys = dev_dax->ranges[0].range.start; + u64 pgmap_phys = dev_dax->pgmap[0].range.start; + + if (!WARN_ON(pgmap_phys > phys)) + data_offset = phys - pgmap_phys; + + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", + __func__, phys, pgmap_phys, data_offset); + } + dev_dax->virt_addr = addr + data_offset; + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &fsdev_fops); + cdev->owner = dev->driver->owner; + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); + if (rc) + return rc; + + /* Set the dax operations for fs-dax access path */ + rc = dax_set_ops(dax_dev, &dev_dax_ops); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax); + if (rc) + return rc; + + run_dax(dax_dev); + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); +} + +static struct dax_device_driver fsdev_dax_driver = { + .probe = fsdev_dax_probe, + .type = DAXDRV_FSDEV_TYPE, +}; + +static int __init dax_init(void) +{ + return dax_driver_register(&fsdev_dax_driver); +} + +static void __exit dax_exit(void) +{ + dax_driver_unregister(&fsdev_dax_driver); +} + +MODULE_AUTHOR("John Groves"); +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); +MODULE_LICENSE("GPL"); +module_init(dax_init); +module_exit(dax_exit); +MODULE_ALIAS_DAX_DEVICE(0); diff --git a/drivers/dax/super.c b/drivers/dax/super.c index c00b9dff4a06..25cf99dd9360 100644 --- a/drivers/dax/super.c +++ 
b/drivers/dax/super.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include "dax-private.h"
+#include "bus.h"
 
 /**
  * struct dax_device - anchor object for dax services
@@ -111,6 +112,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
 }
 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
 
+#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
+
+#if IS_ENABLED(CONFIG_FS_DAX)
+
 void fs_put_dax(struct dax_device *dax_dev, void *holder)
 {
 	if (dax_dev && holder &&
@@ -119,7 +124,66 @@ void fs_put_dax(struct dax_device *dax_dev, void *holder)
 	put_dax(dax_dev);
 }
 EXPORT_SYMBOL_GPL(fs_put_dax);
-#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
+
+/**
+ * fs_dax_get() - take exclusive fs-dax ownership of a devdax device
+ * @dax_dev: dev to be prepared for fs-dax usage
+ * @holder: filesystem or mapped device inside the dax_device
+ * @hops: operations for the inner holder
+ *
+ * Like fs_dax_get_by_bdev(), but for callers that already hold the dax_device
+ * and have no bdev. Takes an inode reference and installs @holder with
+ * cmpxchg(), so only one holder wins. The driver-type check and the claim run
+ * under device_lock() so the binding cannot change between them.
+ *
+ * Return: 0 on success, -ENODEV if the device is missing, dead or unbound,
+ * -EOPNOTSUPP if it is not bound to fsdev_dax, -EBUSY if already held.
+ */
+int fs_dax_get(struct dax_device *dax_dev, void *holder,
+	       const struct dax_holder_operations *hops)
+{
+	struct dax_device_driver *dax_drv;
+	struct dev_dax *dev_dax;
+	int rc = -ENODEV;
+	int id;
+
+	id = dax_read_lock();
+	if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) {
+		dax_read_unlock(id);
+		return -ENODEV;
+	}
+	dax_read_unlock(id);
+
+	dev_dax = dax_get_private(dax_dev);
+	if (!dev_dax)
+		goto err_put;
+
+	device_lock(&dev_dax->dev);
+	if (!dev_dax->dev.driver)
+		goto err_unlock;
+
+	/* Only a device bound to the fsdev_dax driver may be claimed */
+	dax_drv = to_dax_drv(dev_dax->dev.driver);
+	if (dax_drv->type != DAXDRV_FSDEV_TYPE) {
+		rc = -EOPNOTSUPP;
+		goto err_unlock;
+	}
+	if (cmpxchg(&dax_dev->holder_data, NULL, holder)) {
+		rc = -EBUSY;
+		goto err_unlock;
+	}
+	dax_dev->holder_ops = hops;
+	device_unlock(&dev_dax->dev);
+	return 0;
+
+err_unlock:
+	device_unlock(&dev_dax->dev);
+err_put:
+	iput(&dax_dev->inode);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(fs_dax_get);
+#endif /* CONFIG_FS_DAX */
 
 enum dax_device_flags {
 	/* !alive + rcu grace period == no new operations / mappings */
@@ -157,6 +221,9 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 	if (!dax_alive(dax_dev))
 		return -ENXIO;
 
+	if (!dax_dev->ops)
+		return -EOPNOTSUPP;
+
 	if (nr_pages < 0)
 		return -EINVAL;
 
@@ -207,6 +274,10 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 
 	if (!dax_alive(dax_dev))
 		return -ENXIO;
+
+	if (!dax_dev->ops)
+		return -EOPNOTSUPP;
+
 	/*
 	 * There are no callers that want to zero more than one page as of now.
 	 * Once users are there, this check can be removed after the
@@ -223,7 +294,7 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
 size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *iter)
 {
-	if (!dax_dev->ops->recovery_write)
+	if (!dax_dev->ops || !dax_dev->ops->recovery_write)
 		return 0;
 	return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter);
 }
@@ -307,6 +378,35 @@ void set_dax_nomc(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(set_dax_nomc);
 
+/**
+ * dax_set_ops - set the dax_operations for a dax_device
+ * @dax_dev: the dax_device to configure
+ * @ops: the operations to set (may be NULL to clear)
+ *
+ * This allows drivers to set the dax_operations after the dax_device
+ * has been allocated. This is needed when the device is created before
+ * the driver that needs specific ops is bound (e.g., fsdev_dax binding
+ * to a dev_dax created by hmem).
+ *
+ * When setting non-NULL ops, fails if ops are already set (returns -EBUSY).
+ * When clearing ops (NULL), always succeeds.
+ * + * Return: 0 on success, -EBUSY if ops already set + */ +int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops) +{ + if (ops) { + /* Setting ops: fail if already set */ + if (cmpxchg(&dax_dev->ops, NULL, ops) != NULL) + return -EBUSY; + } else { + /* Clearing ops: always allowed */ + dax_dev->ops = NULL; + } + return 0; +} +EXPORT_SYMBOL_GPL(dax_set_ops); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); @@ -421,7 +521,7 @@ static int dax_set(struct inode *inode, void *data) return 0; } -static struct dax_device *dax_dev_get(dev_t devt) +struct dax_device *dax_dev_get(dev_t devt) { struct dax_device *dax_dev; struct inode *inode; @@ -444,6 +544,7 @@ static struct dax_device *dax_dev_get(dev_t devt) return dax_dev; } +EXPORT_SYMBOL_GPL(dax_dev_get); struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { diff --git a/fs/dax.c b/fs/dax.c index a5237169b467..6d175cd47a99 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -377,6 +377,59 @@ static void dax_folio_make_shared(struct folio *folio) folio->share = 1; } +/** + * dax_folio_reset_order - Reset a compound DAX folio to order-0 pages + * @folio: The folio to reset + * + * Splits a compound folio back into individual order-0 pages, + * clearing compound state and restoring pgmap pointers. + * + * Returns: the original folio order (0 if already order-0) + */ +int dax_folio_reset_order(struct folio *folio) +{ + struct dev_pagemap *pgmap = page_pgmap(&folio->page); + int order = folio_order(folio); + + /* + * DAX maintains the invariant that folio->share != 0 only when + * folio->mapping == NULL (enforced by dax_folio_make_shared()). + * Equivalently: folio->mapping != NULL implies folio->share == 0. + * Callers ensure share has been decremented to zero before + * calling here, so unconditionally clearing both fields is + * correct. 
+ */ + folio->mapping = NULL; + folio->share = 0; + + if (!order) { + /* + * Restore pgmap explicitly even for order-0 folios. For the + * dax_folio_put() caller this is a no-op (same value), but + * fsdev_clear_folio_state() may call this on folios that + * were previously compound and need pgmap re-established. + */ + folio->pgmap = pgmap; + return 0; + } + + folio_reset_order(folio); + + for (int i = 0; i < (1UL << order); i++) { + struct page *page = folio_page(folio, i); + struct folio *f = (struct folio *)page; + + ClearPageHead(page); + clear_compound_head(page); + f->mapping = NULL; + f->share = 0; + f->pgmap = pgmap; + } + + return order; +} +EXPORT_SYMBOL_GPL(dax_folio_reset_order); + static inline unsigned long dax_folio_put(struct folio *folio) { unsigned long ref; @@ -390,28 +443,13 @@ static inline unsigned long dax_folio_put(struct folio *folio) if (ref) return ref; - folio->mapping = NULL; - order = folio_order(folio); - if (!order) - return 0; - folio_reset_order(folio); + order = dax_folio_reset_order(folio); + /* Debug check: verify refcounts are zero for all sub-folios */ for (i = 0; i < (1UL << order); i++) { - struct dev_pagemap *pgmap = page_pgmap(&folio->page); struct page *page = folio_page(folio, i); - struct folio *new_folio = (struct folio *)page; - ClearPageHead(page); - clear_compound_head(page); - - new_folio->mapping = NULL; - /* - * Reset pgmap which was over-written by - * prep_compound_page(). 
- */ - new_folio->pgmap = pgmap; - new_folio->share = 0; - WARN_ON_ONCE(folio_ref_count(new_folio)); + WARN_ON_ONCE(folio_ref_count((struct folio *)page)); } return ref; diff --git a/include/linux/dax.h b/include/linux/dax.h index 10a7cc79aea5..fe6c3ded1b50 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -54,6 +54,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); +struct dax_device *dax_dev_get(dev_t devt); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); @@ -130,7 +131,6 @@ int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops); -void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -145,14 +145,15 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) -{ -} #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) +void fs_put_dax(struct dax_device *dax_dev, void *holder); +int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops); int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); +int dax_folio_reset_order(struct folio *folio); struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); @@ -163,6 +164,15 @@ dax_entry_t 
dax_lock_mapping_entry(struct address_space *mapping, void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie); #else +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ +} + +static inline int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops) +{ + return -EOPNOTSUPP; +} static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; @@ -242,6 +252,7 @@ static inline void dax_break_layout_final(struct inode *inode) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); +int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,