From a73cc506ad9f3798d33c78b212149b80d212111a Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:08 +0000 Subject: [PATCH 1/9] dax: move dax_pgoff_to_phys from [drivers/dax/] device.c to bus.c This function will be used by both device.c and fsdev.c, but both are loadable modules. Moving to bus.c puts it in core and makes it available to both. No code changes - just relocated. Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311c90eb-a582ff97-93ba-49f3-8140-6c5c4bf8bc62-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/bus.c | 20 ++++++++++++++++++++ drivers/dax/device.c | 23 ----------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index c94c09622516..1b412264bb36 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -1417,6 +1417,26 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; +/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ +__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, + unsigned long size) +{ + for (int i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + phys_addr_t phys; + + if (!in_range(pgoff, dax_range->pgoff, PHYS_PFN(range_len(range)))) + continue; + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (phys + size - 1 <= range->end) + return phys; + break; + } + return -1; +} +EXPORT_SYMBOL_GPL(dax_pgoff_to_phys); + static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data) { struct dax_region *dax_region = data->dax_region; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 528e81240c4d..2d2dbfd35e94 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -57,29 +57,6 @@ static int check_vma(struct dev_dax *dev_dax, struct 
vm_area_struct *vma, vma->vm_file, func); } -/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ -__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, - unsigned long size) -{ - int i; - - for (i = 0; i < dev_dax->nr_range; i++) { - struct dev_dax_range *dax_range = &dev_dax->ranges[i]; - struct range *range = &dax_range->range; - unsigned long long pgoff_end; - phys_addr_t phys; - - pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; - if (pgoff < dax_range->pgoff || pgoff > pgoff_end) - continue; - phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; - if (phys + size - 1 <= range->end) - return phys; - break; - } - return -1; -} - static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn, unsigned long fault_size) { From 59eb73b98ae0b12fc9b39c08f0f5a5552cb02d1e Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:22 +0000 Subject: [PATCH 2/9] dax: Factor out dax_folio_reset_order() helper Both fs/dax.c:dax_folio_put() and drivers/dax/fsdev.c: fsdev_clear_folio_state() (the latter coming in the next commit after this one) contain nearly identical code to reset a compound DAX folio back to order-0 pages. Factor this out into a shared helper function. The new dax_folio_reset_order() function: - Clears the folio's mapping and share count - Resets compound folio state via folio_reset_order() - Clears PageHead and compound_head for each sub-page - Restores the pgmap pointer for each resulting order-0 folio - Returns the original folio order (for callers that need to advance by that many pages) Two intentional differences from the original dax_folio_put() logic: 1. folio->share is cleared unconditionally. This is correct because the DAX subsystem maintains the invariant that share != 0 only when mapping == NULL (enforced by dax_folio_make_shared()). dax_folio_put() ensures share has reached zero before calling this helper, so the unconditional clear is safe. 2. 
folio->pgmap is now explicitly restored for order-0 folios. For the dax_folio_put() caller this is a no-op (reads and writes back the same field). It is intentional for the upcoming fsdev_clear_folio_state() caller, which converts previously-compound folios and needs pgmap re-established for all pages regardless of order. This simplifies fsdev_clear_folio_state() from ~50 lines to ~15 lines. Suggested-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311cc6b9-5be7428a-7f16-4774-8f90-a44b88ac5660-000000@email.amazonses.com Signed-off-by: Ira Weiny --- fs/dax.c | 73 ++++++++++++++++++++++++++++++++++----------- include/linux/dax.h | 1 + 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 289e6254aa30..87bed6de920d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -378,6 +378,58 @@ static void dax_folio_make_shared(struct folio *folio) folio->share = 1; } +/** + * dax_folio_reset_order - Reset a compound DAX folio to order-0 pages + * @folio: The folio to reset + * + * Splits a compound folio back into individual order-0 pages, + * clearing compound state and restoring pgmap pointers. + * + * Returns: the original folio order (0 if already order-0) + */ +int dax_folio_reset_order(struct folio *folio) +{ + struct dev_pagemap *pgmap = page_pgmap(&folio->page); + int order = folio_order(folio); + + /* + * DAX maintains the invariant that folio->share != 0 only when + * folio->mapping == NULL (enforced by dax_folio_make_shared()). + * Equivalently: folio->mapping != NULL implies folio->share == 0. + * Callers ensure share has been decremented to zero before + * calling here, so unconditionally clearing both fields is + * correct. + */ + folio->mapping = NULL; + folio->share = 0; + + if (!order) { + /* + * Restore pgmap explicitly even for order-0 folios. 
For the + * dax_folio_put() caller this is a no-op (same value), but + * fsdev_clear_folio_state() may call this on folios that + * were previously compound and need pgmap re-established. + */ + folio->pgmap = pgmap; + return 0; + } + + folio_reset_order(folio); + + for (int i = 0; i < (1UL << order); i++) { + struct page *page = folio_page(folio, i); + struct folio *f = (struct folio *)page; + + ClearPageHead(page); + clear_compound_head(page); + f->mapping = NULL; + f->share = 0; + f->pgmap = pgmap; + } + + return order; +} + static inline unsigned long dax_folio_put(struct folio *folio) { unsigned long ref; @@ -391,28 +443,13 @@ static inline unsigned long dax_folio_put(struct folio *folio) if (ref) return ref; - folio->mapping = NULL; - order = folio_order(folio); - if (!order) - return 0; - folio_reset_order(folio); + order = dax_folio_reset_order(folio); + /* Debug check: verify refcounts are zero for all sub-folios */ for (i = 0; i < (1UL << order); i++) { - struct dev_pagemap *pgmap = page_pgmap(&folio->page); struct page *page = folio_page(folio, i); - struct folio *new_folio = (struct folio *)page; - ClearPageHead(page); - clear_compound_head(page); - - new_folio->mapping = NULL; - /* - * Reset pgmap which was over-written by - * prep_compound_page(). 
- */ - new_folio->pgmap = pgmap; - new_folio->share = 0; - WARN_ON_ONCE(folio_ref_count(new_folio)); + WARN_ON_ONCE(folio_ref_count((struct folio *)page)); } return ref; diff --git a/include/linux/dax.h b/include/linux/dax.h index bf103f317cac..73cfc1a7c8f1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -153,6 +153,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) #if IS_ENABLED(CONFIG_FS_DAX) int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); +int dax_folio_reset_order(struct folio *folio); struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); From d5406bd458b0ac10b1301a4d5801d85c8f648637 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:35 +0000 Subject: [PATCH 3/9] dax: add fsdev.c driver for fs-dax on character dax The new fsdev driver provides pages/folios initialized compatibly with fsdax - normal rather than devdax-style refcounting, and starting out with order-0 folios. When fsdev binds to a daxdev, it is usually (always?) switching from the devdax mode (device.c), which pre-initializes compound folios according to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the folios into a fsdax-compatible state. A side effect of this is that raw mmap doesn't (can't?) work on an fsdev dax instance. Accordingly, the fsdev driver does not provide raw mmap - devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw mmap capability. This commit contains just the framework, which remaps pages/folios compatibly with fsdax.
Enabling dax changes: - bus.h: add DAXDRV_FSDEV_TYPE driver type - bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs - dax.h: prototype inode_dax(), which fsdev needs Suggested-by: Dan Williams Suggested-by: Gregory Price Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311cf904-419e9526-bdaf-4daa-97f1-5060b31a5c9f-000000@email.amazonses.com Signed-off-by: Ira Weiny --- MAINTAINERS | 8 ++ drivers/dax/Kconfig | 5 + drivers/dax/Makefile | 2 + drivers/dax/bus.h | 1 + drivers/dax/fsdev.c | 245 +++++++++++++++++++++++++++++++++++++++++++ fs/dax.c | 1 + 6 files changed, 262 insertions(+) create mode 100644 drivers/dax/fsdev.c diff --git a/MAINTAINERS b/MAINTAINERS index c3fe46d7c4bc..ac49067c64ee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7298,6 +7298,14 @@ L: linux-cxl@vger.kernel.org S: Supported F: drivers/dax/ +DEVICE DIRECT ACCESS (DAX) [fsdev_dax] +M: John Groves +M: John Groves +L: nvdimm@lists.linux.dev +L: linux-cxl@vger.kernel.org +S: Supported +F: drivers/dax/fsdev.c + DEVICE FREQUENCY (DEVFREQ) M: MyungJoo Ham M: Kyungmin Park diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index d656e4c0eb84..6d8493cc540c 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -61,6 +61,11 @@ config DEV_DAX_HMEM_DEVICES depends on DEV_DAX_HMEM && DAX def_bool y +config DEV_DAX_FSDEV + tristate + depends on DEV_DAX && FS_DAX + default DEV_DAX + config DEV_DAX_KMEM tristate "KMEM DAX: map dax-devices as System-RAM" default DEV_DAX diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 5ed5c39857c8..ba35bda7abef 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -4,11 +4,13 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o +obj-$(CONFIG_DEV_DAX_FSDEV) += fsdev_dax.o dax-y := super.o dax-y += bus.o device_dax-y := device.o dax_pmem-y := pmem.o dax_cxl-y := cxl.o +fsdev_dax-y := 
fsdev.o obj-y += hmem/ diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index cbbf64443098..880bdf7e72d7 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -31,6 +31,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); enum dax_driver_type { DAXDRV_KMEM_TYPE, DAXDRV_DEVICE_TYPE, + DAXDRV_FSDEV_TYPE, }; struct dax_device_driver { diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c new file mode 100644 index 000000000000..8b5c6976ad17 --- /dev/null +++ b/drivers/dax/fsdev.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2026 Micron Technology, Inc. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +/* + * FS-DAX compatible devdax driver + * + * Unlike drivers/dax/device.c which pre-initializes compound folios based + * on device alignment (via vmemmap_shift), this driver leaves folios + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs + * to work without needing special handling for pre-initialized folios. + * + * Key differences from device.c: + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) + * - vmemmap_shift is NOT set (folios remain order-0) + * - fs-dax can dynamically create compound folios as needed + * - No mmap support - all access is through fs-dax/iomap + */ + +static void fsdev_cdev_del(void *cdev) +{ + cdev_del(cdev); +} + +static void fsdev_kill(void *dev_dax) +{ + kill_dev_dax(dev_dax); +} + +/* + * Page map operations for FS-DAX mode + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c + * + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. + * The core mm code in free_zone_device_folio() handles the wake_up_var() + * directly for this memory type. 
+ */ +static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct dev_dax *dev_dax = pgmap->owner; + u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(dev_dax->dax_dev, offset, + len, mf_flags); +} + +static const struct dev_pagemap_ops fsdev_pagemap_ops = { + .memory_failure = fsdev_pagemap_memory_failure, +}; + +/* + * Clear any stale folio state from pages in the given range. + * This is necessary because device_dax pre-initializes compound folios + * based on vmemmap_shift, and that state may persist after driver unbind. + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax + * expects to find clean order-0 folios that it can build into compound + * folios on demand. + * + * At probe time, no filesystem should be mounted yet, so all mappings + * are stale and must be cleared along with compound state. + */ +static void fsdev_clear_folio_state(struct dev_dax *dev_dax) +{ + for (int i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + unsigned long pfn = PHYS_PFN(range->start); + unsigned long end_pfn = PHYS_PFN(range->end) + 1; + + while (pfn < end_pfn) { + struct folio *folio = pfn_folio(pfn); + int order = dax_folio_reset_order(folio); + + pfn += 1UL << order; + } + } +} + +static void fsdev_clear_folio_state_action(void *data) +{ + fsdev_clear_folio_state(data); +} + +static int fsdev_open(struct inode *inode, struct file *filp) +{ + struct dax_device *dax_dev = inode_dax(inode); + struct dev_dax *dev_dax = dax_get_private(dax_dev); + + filp->private_data = dev_dax; + + return 0; +} + +static int fsdev_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static const struct file_operations fsdev_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, + .open = fsdev_open, + .release = fsdev_release, +}; + +static int 
fsdev_dax_probe(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; + struct inode *inode; + struct cdev *cdev; + void *addr; + int rc, i; + + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } + + pgmap = dev_dax->pgmap; + } else { + size_t pgmap_size; + + if (dev_dax->pgmap) { + dev_warn(dev, "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); + pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + pgmap->ranges[i] = *range; + } + } + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + } + + /* + * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving + * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this + * lets fs-dax dynamically build compound folios as needed, similar + * to pmem behavior. + */ + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->ops = &fsdev_pagemap_ops; + pgmap->owner = dev_dax; + + addr = devm_memremap_pages(dev, pgmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + /* + * Clear any stale compound folio state left over from a previous + * driver (e.g., device_dax with vmemmap_shift). Also register this + * as a devm action so folio state is cleared on unbind, ensuring + * clean pages for subsequent drivers (e.g., kmem for system-ram). 
+ */ + fsdev_clear_folio_state(dev_dax); + rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, + dev_dax); + if (rc) + return rc; + + /* Detect whether the data is at a non-zero offset into the memory */ + if (pgmap->range.start != dev_dax->ranges[0].range.start) { + u64 phys = dev_dax->ranges[0].range.start; + u64 pgmap_phys = dev_dax->pgmap[0].range.start; + u64 data_offset = 0; + + if (!WARN_ON(pgmap_phys > phys)) + data_offset = phys - pgmap_phys; + + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", + __func__, phys, pgmap_phys, data_offset); + } + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &fsdev_fops); + cdev->owner = dev->driver->owner; + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); + if (rc) + return rc; + + run_dax(dax_dev); + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); +} + +static struct dax_device_driver fsdev_dax_driver = { + .probe = fsdev_dax_probe, + .type = DAXDRV_FSDEV_TYPE, +}; + +static int __init dax_init(void) +{ + return dax_driver_register(&fsdev_dax_driver); +} + +static void __exit dax_exit(void) +{ + dax_driver_unregister(&fsdev_dax_driver); +} + +MODULE_AUTHOR("John Groves"); +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); +MODULE_LICENSE("GPL"); +module_init(dax_init); +module_exit(dax_exit); +MODULE_ALIAS_DAX_DEVICE(0); diff --git a/fs/dax.c b/fs/dax.c index 87bed6de920d..cb5c87e43bc3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -429,6 +429,7 @@ int dax_folio_reset_order(struct folio *folio) return order; } +EXPORT_SYMBOL_GPL(dax_folio_reset_order); static inline unsigned long dax_folio_put(struct folio *folio) { From 759455848df0b9ac3acabdbedcdc4a55af67935f Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:44 +0000 Subject: [PATCH 4/9] dax: Save the kva from memremap Save the kva from memremap because we 
need it for iomap rw support. Prior to famfs, there were no iomap users of /dev/dax - so the virtual address from memremap was not needed. Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d1d08-dd372cb9-5934-43b8-bef8-089660d04a81-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/dax-private.h | 2 ++ drivers/dax/fsdev.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index c6ae27c982f4..7a3727d76a68 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -69,6 +69,7 @@ struct dev_dax_range { * data while the device is activated in the driver. * @region: parent region * @dax_dev: core dax functionality + * @virt_addr: kva from memremap; used by fsdev_dax * @align: alignment of this instance * @target_node: effective numa node if dev_dax memory range is onlined * @dyn_id: is this a dynamic or statically created instance @@ -83,6 +84,7 @@ struct dev_dax_range { struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; + void *virt_addr; unsigned int align; int target_node; bool dyn_id; diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index 8b5c6976ad17..c75478d3d548 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -121,6 +121,7 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) struct device *dev = &dev_dax->dev; struct dev_pagemap *pgmap; struct inode *inode; + u64 data_offset = 0; struct cdev *cdev; void *addr; int rc, i; @@ -196,7 +197,6 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) if (pgmap->range.start != dev_dax->ranges[0].range.start) { u64 phys = dev_dax->ranges[0].range.start; u64 pgmap_phys = dev_dax->pgmap[0].range.start; - u64 data_offset = 0; if (!WARN_ON(pgmap_phys > phys)) data_offset = phys - pgmap_phys; @@ -204,6 +204,7 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) pr_debug("%s: offset 
detected phys=%llx pgmap_phys=%llx offset=%llx\n", __func__, phys, pgmap_phys, data_offset); } + dev_dax->virt_addr = addr + data_offset; inode = dax_inode(dax_dev); cdev = inode->i_cdev; From 099c81a1f0ab3e948d73c5ab2b7a3b702af36e64 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:54 +0000 Subject: [PATCH 5/9] dax: Add dax_operations for use by fs-dax on fsdev dax fsdev: Add dax_operations for use by famfs. This replicates the functionality from drivers/nvdimm/pmem.c that conventional fs-dax file systems (e.g. xfs) use to support dax read/write/mmap to a daxdev - without which famfs can't sit atop a daxdev. - These methods are based on pmem_dax_ops from drivers/nvdimm/pmem.c - fsdev_dax_direct_access() returns the hpa, pfn and kva. The kva was newly stored as dev_dax->virt_addr by fsdev_dax_probe(). - The hpa/pfn are used for mmap (dax_iomap_fault()), and the kva is used for read/write (dax_iomap_rw()) - fsdev_dax_recovery_write() and fsdev_dax_zero_page_range() have not been tested yet. I'm looking for suggestions as to how to test those. - dax-private.h: add dev_dax->cached_size, which fsdev needs to remember. The dev_dax size cannot change while a driver is bound (dev_dax_resize returns -EBUSY if dev->driver is set). Caching the size at probe time allows fsdev's direct_access path to use it without acquiring dax_dev_rwsem (which isn't exported anyway).
Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d415a-bd6af0fe-5445-484c-9d39-210b8170b686-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/dax-private.h | 2 + drivers/dax/fsdev.c | 84 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 7a3727d76a68..81e4af49e39c 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -70,6 +70,7 @@ struct dev_dax_range { * @region: parent region * @dax_dev: core dax functionality * @virt_addr: kva from memremap; used by fsdev_dax + * @cached_size: size of daxdev cached by fsdev_dax * @align: alignment of this instance * @target_node: effective numa node if dev_dax memory range is onlined * @dyn_id: is this a dynamic or statically created instance @@ -85,6 +86,7 @@ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; void *virt_addr; + u64 cached_size; unsigned int align; int target_node; bool dyn_id; diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index c75478d3d548..30f57c74c979 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -28,6 +28,85 @@ * - No mmap support - all access is through fs-dax/iomap */ +static void fsdev_write_dax(void *addr, struct page *page, + unsigned int off, unsigned int len) +{ + while (len) { + void *mem = kmap_local_page(page); + unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off); + + memcpy_flushcache(addr, mem + off, chunk); + kunmap_local(mem); + len -= chunk; + off = 0; + page++; + addr += chunk; + } +} + +static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + unsigned long *pfn) +{ + struct dev_dax *dev_dax = dax_get_private(dax_dev); + size_t size = nr_pages << PAGE_SHIFT; + size_t offset = pgoff << PAGE_SHIFT; + void *virt_addr = dev_dax->virt_addr + offset; + phys_addr_t phys; + unsigned long local_pfn; + + phys 
= dax_pgoff_to_phys(dev_dax, pgoff, size); + if (phys == -1) { + dev_dbg(&dev_dax->dev, + "pgoff (%#lx) out of range\n", pgoff); + return -EFAULT; + } + + if (kaddr) + *kaddr = virt_addr; + + local_pfn = PHYS_PFN(phys); + if (pfn) + *pfn = local_pfn; + + /* + * Use cached_size which was computed at probe time. The size cannot + * change while the driver is bound (resize returns -EBUSY). + */ + return PHYS_PFN(min(size, dev_dax->cached_size - offset)); +} + +static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + void *kaddr; + + WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); + __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); + return 0; +} + +static long fsdev_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, unsigned long *pfn) +{ + return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode, + kaddr, pfn); +} + +static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return _copy_from_iter_flushcache(addr, bytes, i); +} + +static const struct dax_operations dev_dax_ops = { + .direct_access = fsdev_dax_direct_access, + .zero_page_range = fsdev_dax_zero_page_range, + .recovery_write = fsdev_dax_recovery_write, +}; + static void fsdev_cdev_del(void *cdev) { cdev_del(cdev); @@ -167,6 +246,11 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) } } + /* Cache size now; it cannot change while driver is bound */ + dev_dax->cached_size = 0; + for (i = 0; i < dev_dax->nr_range; i++) + dev_dax->cached_size += range_len(&dev_dax->ranges[i].range); + /* * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving * folios at order-0. 
Unlike device.c (MEMORY_DEVICE_GENERIC), this From 700ecbc1f5aa02ba9ad68d7be1ef7a9c8eae07e9 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:03 +0000 Subject: [PATCH 6/9] dax: Add dax_set_ops() for setting dax_operations at bind time Add a new dax_set_ops() function that allows drivers to set the dax_operations after the dax_device has been allocated. This is needed for fsdev_dax where the operations need to be set during probe and cleared during unbind. The fsdev driver uses devm_add_action_or_reset() for cleanup consistency, avoiding the complexity of mixing devm-managed resources with manual cleanup in a remove() callback. This ensures cleanup happens automatically in the correct reverse order when the device is unbound. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d65a0-b9c1419e-f3a0-4afd-b0bd-848f18ff5950-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 16 ++++++++++++++++ drivers/dax/super.c | 38 +++++++++++++++++++++++++++++++++++++- include/linux/dax.h | 1 + 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index 30f57c74c979..4499d9621f33 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -117,6 +117,13 @@ static void fsdev_kill(void *dev_dax) kill_dev_dax(dev_dax); } +static void fsdev_clear_ops(void *data) +{ + struct dev_dax *dev_dax = data; + + dax_set_ops(dev_dax->dax_dev, NULL); +} + /* * Page map operations for FS-DAX mode * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c @@ -303,6 +310,15 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) if (rc) return rc; + /* Set the dax operations for fs-dax access path */ + rc = dax_set_ops(dax_dev, &dev_dax_ops); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax); + if (rc) + return rc; + run_dax(dax_dev); return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); } diff 
--git a/drivers/dax/super.c b/drivers/dax/super.c index c00b9dff4a06..ba0b4cd18a77 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -157,6 +157,9 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, if (!dax_alive(dax_dev)) return -ENXIO; + if (!dax_dev->ops) + return -EOPNOTSUPP; + if (nr_pages < 0) return -EINVAL; @@ -207,6 +210,10 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, if (!dax_alive(dax_dev)) return -ENXIO; + + if (!dax_dev->ops) + return -EOPNOTSUPP; + /* * There are no callers that want to zero more than one page as of now. * Once users are there, this check can be removed after the @@ -223,7 +230,7 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *iter) { - if (!dax_dev->ops->recovery_write) + if (!dax_dev->ops || !dax_dev->ops->recovery_write) return 0; return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, iter); } @@ -307,6 +314,35 @@ void set_dax_nomc(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(set_dax_nomc); +/** + * dax_set_ops - set the dax_operations for a dax_device + * @dax_dev: the dax_device to configure + * @ops: the operations to set (may be NULL to clear) + * + * This allows drivers to set the dax_operations after the dax_device + * has been allocated. This is needed when the device is created before + * the driver that needs specific ops is bound (e.g., fsdev_dax binding + * to a dev_dax created by hmem). + * + * When setting non-NULL ops, fails if ops are already set (returns -EBUSY). + * When clearing ops (NULL), always succeeds. 
+ * + * Return: 0 on success, -EBUSY if ops already set + */ +int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops) +{ + if (ops) { + /* Setting ops: fail if already set */ + if (cmpxchg(&dax_dev->ops, NULL, ops) != NULL) + return -EBUSY; + } else { + /* Clearing ops: always allowed */ + dax_dev->ops = NULL; + } + return 0; +} +EXPORT_SYMBOL_GPL(dax_set_ops); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); diff --git a/include/linux/dax.h b/include/linux/dax.h index 73cfc1a7c8f1..b19bfe0c2fd1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -243,6 +243,7 @@ static inline void dax_break_layout_final(struct inode *inode) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); +int dax_set_ops(struct dax_device *dax_dev, const struct dax_operations *ops); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, unsigned long *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, From eec38f5d86d27535509c99f02ccc642ceb0c3e2a Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:12 +0000 Subject: [PATCH 7/9] dax: Add fs_dax_get() func to prepare dax for fs-dax usage The fs_dax_get() function should be called by fs-dax file systems after opening a fsdev dax device. This adds holder_operations, which provides a memory failure callback path and effects exclusivity between callers of fs_dax_get(). fs_dax_get() is specific to fsdev_dax, so it checks the driver type (which required touching bus.[ch]). fs_dax_get() fails if fsdev_dax is not bound to the memory. This function serves the same role as fs_dax_get_by_bdev(), which dax file systems call after opening the pmem block device. This can't be located in fsdev.c because struct dax_device is opaque there. This will be called by fs/fuse/famfs.c in a subsequent commit. 
Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d8750-75395c22-031b-4d5f-aebe-790dca656b87-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/bus.c | 2 -- drivers/dax/bus.h | 2 ++ drivers/dax/super.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/dax.h | 16 ++++++++--- 4 files changed, 79 insertions(+), 7 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 1b412264bb36..32f7b7702c28 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -39,8 +39,6 @@ static int dax_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "MODALIAS=" DAX_DEVICE_MODALIAS_FMT, 0); } -#define to_dax_drv(__drv) container_of_const(__drv, struct dax_device_driver, drv) - static struct dax_id *__dax_match_id(const struct dax_device_driver *dax_drv, const char *dev_name) { diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 880bdf7e72d7..dc6f112ac4a4 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -42,6 +42,8 @@ struct dax_device_driver { void (*remove)(struct dev_dax *dev); }; +#define to_dax_drv(__drv) container_of_const(__drv, struct dax_device_driver, drv) + int __dax_driver_register(struct dax_device_driver *dax_drv, struct module *module, const char *mod_name); #define dax_driver_register(driver) \ diff --git a/drivers/dax/super.c b/drivers/dax/super.c index ba0b4cd18a77..d4ab60c406bf 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -14,6 +14,7 @@ #include #include #include "dax-private.h" +#include "bus.h" /** * struct dax_device - anchor object for dax services @@ -111,6 +112,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); +#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ + +#if IS_ENABLED(CONFIG_FS_DAX) + void fs_put_dax(struct dax_device *dax_dev, void *holder) { if (dax_dev && holder && @@ -119,7 +124,66 @@ void 
fs_put_dax(struct dax_device *dax_dev, void *holder) put_dax(dax_dev); } EXPORT_SYMBOL_GPL(fs_put_dax); -#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ + +/** + * fs_dax_get() - get ownership of a devdax via holder/holder_ops + * @dax_dev: dax_device to be prepared for fs-dax usage + * @holder: filesystem or mapped device inside the dax_device + * @hops: operations for the inner holder + * + * fs-dax file systems call this function to prepare to use a devdax device for + * fsdax. This is like fs_dax_get_by_bdev(), but the caller already has the + * struct dax_device (and there is no bdev). The holder makes this exclusive. + * On success the igrab() reference on the dax_device inode is left held. + * + * Return: 0, -ENODEV (dead/unbound), -EOPNOTSUPP (not fsdev_dax), -EBUSY (held) + */ +int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops) +{ + struct dev_dax *dev_dax; + struct dax_device_driver *dax_drv; + int id; + + id = dax_read_lock(); + if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) { + dax_read_unlock(id); + return -ENODEV; + } + dax_read_unlock(id); + + /* Verify the device is bound to fsdev_dax driver */ + dev_dax = dax_get_private(dax_dev); + if (!dev_dax) { + iput(&dax_dev->inode); + return -ENODEV; + } + + device_lock(&dev_dax->dev); + if (!dev_dax->dev.driver) { + device_unlock(&dev_dax->dev); + iput(&dax_dev->inode); + return -ENODEV; + } + dax_drv = to_dax_drv(dev_dax->dev.driver); + if (dax_drv->type != DAXDRV_FSDEV_TYPE) { + device_unlock(&dev_dax->dev); + iput(&dax_dev->inode); + return -EOPNOTSUPP; + } + device_unlock(&dev_dax->dev); + + if (cmpxchg(&dax_dev->holder_data, NULL, holder)) { + iput(&dax_dev->inode); + return -EBUSY; + } + + dax_dev->holder_ops = hops; + + return 0; +} +EXPORT_SYMBOL_GPL(fs_dax_get); +#endif /* CONFIG_FS_DAX */ enum dax_device_flags { /* !alive + rcu grace period == no new operations / mappings */ diff --git a/include/linux/dax.h b/include/linux/dax.h index b19bfe0c2fd1..a85e270bfb3c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -130,7 +130,6 @@ int
dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, void *holder, const struct dax_holder_operations *ops); -void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -145,12 +144,12 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) -{ -} #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ #if IS_ENABLED(CONFIG_FS_DAX) +void fs_put_dax(struct dax_device *dax_dev, void *holder); +int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops); int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); int dax_folio_reset_order(struct folio *folio); @@ -164,6 +163,15 @@ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, void dax_unlock_mapping_entry(struct address_space *mapping, unsigned long index, dax_entry_t cookie); #else +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ +} + +static inline int fs_dax_get(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *hops) +{ + return -EOPNOTSUPP; +} static inline struct page *dax_layout_busy_page(struct address_space *mapping) { return NULL; From 2ae624d5a555d47a735fb3f4d850402859a4db77 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:21 +0000 Subject: [PATCH 8/9] dax: export dax_dev_get() famfs needs to look up a dax_device by dev_t when resolving fmap entries that reference character dax devices. 
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311daab5-bb212f0b-4e05-4668-bf53-d76fab56be68-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/super.c | 3 ++- include/linux/dax.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index d4ab60c406bf..25cf99dd9360 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -521,7 +521,7 @@ static int dax_set(struct inode *inode, void *data) return 0; } -static struct dax_device *dax_dev_get(dev_t devt) +struct dax_device *dax_dev_get(dev_t devt) { struct dax_device *dax_dev; struct inode *inode; @@ -544,6 +544,7 @@ static struct dax_device *dax_dev_get(dev_t devt) return dax_dev; } +EXPORT_SYMBOL_GPL(dax_dev_get); struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { diff --git a/include/linux/dax.h b/include/linux/dax.h index a85e270bfb3c..9ef95b136bb8 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -54,6 +54,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); +struct dax_device *dax_dev_get(dev_t devt); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); From 45df9111692c62d5f09fc4345ae36dae31024797 Mon Sep 17 00:00:00 2001 From: John Groves Date: Sun, 12 Apr 2026 15:50:06 +0000 Subject: [PATCH 9/9] dax/fsdev: fix uninitialized kaddr in fsdev_dax_zero_page_range() __fsdev_dax_direct_access() returns -EFAULT without setting *kaddr when dax_pgoff_to_phys() returns -1 (pgoff out of range). The return value was ignored, leaving kaddr uninitialized before being passed to fsdev_write_dax(). Check the return value and propagate the error. 
Thanks to Dan Carpenter and the smatch project for reporting this. Signed-off-by: John Groves Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/0100019d8262cda2-9714d31c-8fc1-4ca5-b32d-4df678240d14-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index 4499d9621f33..188b2526bee4 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -80,9 +80,12 @@ static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { void *kaddr; + long rc; WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); - __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + rc = __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + if (rc < 0) + return rc; fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); return 0; }