From d5406bd458b0ac10b1301a4d5801d85c8f648637 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:35 +0000 Subject: dax: add fsdev.c driver for fs-dax on character dax The new fsdev driver provides pages/folios initialized compatibly with fsdax - normal rather than devdax-style refcounting, and starting out with order-0 folios. When fsdev binds to a daxdev, it is usually (always?) switching from the devdax mode (device.c), which pre-initializes compound folios according to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the folios into a fsdax-compatible state. A side effect of this is that raw mmap doesn't (can't?) work on an fsdev dax instance. Accordingly, the fsdev driver does not provide raw mmap - devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw mmap capability. This commit is just the framework, which remaps pages/folios compatibly with fsdax. Enabling dax changes: - bus.h: add DAXDRV_FSDEV_TYPE driver type - bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs - dax.h: prototype inode_dax(), which fsdev needs Suggested-by: Dan Williams Suggested-by: Gregory Price Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311cf904-419e9526-bdaf-4daa-97f1-5060b31a5c9f-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 drivers/dax/fsdev.c (limited to 'drivers/dax/fsdev.c') diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c new file mode 100644 index 000000000000..8b5c6976ad17 --- /dev/null +++ b/drivers/dax/fsdev.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2026 Micron Technology, Inc. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +/* + * FS-DAX compatible devdax driver + * + * Unlike drivers/dax/device.c which pre-initializes compound folios based + * on device alignment (via vmemmap_shift), this driver leaves folios + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs + * to work without needing special handling for pre-initialized folios. + * + * Key differences from device.c: + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) + * - vmemmap_shift is NOT set (folios remain order-0) + * - fs-dax can dynamically create compound folios as needed + * - No mmap support - all access is through fs-dax/iomap + */ + +static void fsdev_cdev_del(void *cdev) +{ + cdev_del(cdev); +} + +static void fsdev_kill(void *dev_dax) +{ + kill_dev_dax(dev_dax); +} + +/* + * Page map operations for FS-DAX mode + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c + * + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. + * The core mm code in free_zone_device_folio() handles the wake_up_var() + * directly for this memory type. + */ +static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct dev_dax *dev_dax = pgmap->owner; + u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(dev_dax->dax_dev, offset, + len, mf_flags); +} + +static const struct dev_pagemap_ops fsdev_pagemap_ops = { + .memory_failure = fsdev_pagemap_memory_failure, +}; + +/* + * Clear any stale folio state from pages in the given range. + * This is necessary because device_dax pre-initializes compound folios + * based on vmemmap_shift, and that state may persist after driver unbind. 
+ * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax + * expects to find clean order-0 folios that it can build into compound + * folios on demand. + * + * At probe time, no filesystem should be mounted yet, so all mappings + * are stale and must be cleared along with compound state. + */ +static void fsdev_clear_folio_state(struct dev_dax *dev_dax) +{ + for (int i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + unsigned long pfn = PHYS_PFN(range->start); + unsigned long end_pfn = PHYS_PFN(range->end) + 1; + + while (pfn < end_pfn) { + struct folio *folio = pfn_folio(pfn); + int order = dax_folio_reset_order(folio); + + pfn += 1UL << order; + } + } +} + +static void fsdev_clear_folio_state_action(void *data) +{ + fsdev_clear_folio_state(data); +} + +static int fsdev_open(struct inode *inode, struct file *filp) +{ + struct dax_device *dax_dev = inode_dax(inode); + struct dev_dax *dev_dax = dax_get_private(dax_dev); + + filp->private_data = dev_dax; + + return 0; +} + +static int fsdev_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static const struct file_operations fsdev_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, + .open = fsdev_open, + .release = fsdev_release, +}; + +static int fsdev_dax_probe(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; + struct inode *inode; + struct cdev *cdev; + void *addr; + int rc, i; + + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } + + pgmap = dev_dax->pgmap; + } else { + size_t pgmap_size; + + if (dev_dax->pgmap) { + dev_warn(dev, "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); + pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); + if (!pgmap) + return 
-ENOMEM; + + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + pgmap->ranges[i] = *range; + } + } + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + } + + /* + * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving + * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this + * lets fs-dax dynamically build compound folios as needed, similar + * to pmem behavior. + */ + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->ops = &fsdev_pagemap_ops; + pgmap->owner = dev_dax; + + addr = devm_memremap_pages(dev, pgmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + /* + * Clear any stale compound folio state left over from a previous + * driver (e.g., device_dax with vmemmap_shift). Also register this + * as a devm action so folio state is cleared on unbind, ensuring + * clean pages for subsequent drivers (e.g., kmem for system-ram). 
+ */ + fsdev_clear_folio_state(dev_dax); + rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, + dev_dax); + if (rc) + return rc; + + /* Detect whether the data is at a non-zero offset into the memory */ + if (pgmap->range.start != dev_dax->ranges[0].range.start) { + u64 phys = dev_dax->ranges[0].range.start; + u64 pgmap_phys = dev_dax->pgmap[0].range.start; + u64 data_offset = 0; + + if (!WARN_ON(pgmap_phys > phys)) + data_offset = phys - pgmap_phys; + + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", + __func__, phys, pgmap_phys, data_offset); + } + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &fsdev_fops); + cdev->owner = dev->driver->owner; + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); + if (rc) + return rc; + + run_dax(dax_dev); + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); +} + +static struct dax_device_driver fsdev_dax_driver = { + .probe = fsdev_dax_probe, + .type = DAXDRV_FSDEV_TYPE, +}; + +static int __init dax_init(void) +{ + return dax_driver_register(&fsdev_dax_driver); +} + +static void __exit dax_exit(void) +{ + dax_driver_unregister(&fsdev_dax_driver); +} + +MODULE_AUTHOR("John Groves"); +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); +MODULE_LICENSE("GPL"); +module_init(dax_init); +module_exit(dax_exit); +MODULE_ALIAS_DAX_DEVICE(0); -- cgit v1.2.3 From 759455848df0b9ac3acabdbedcdc4a55af67935f Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:44 +0000 Subject: dax: Save the kva from memremap Save the kva from memremap because we need it for iomap rw support. Prior to famfs, there were no iomap users of /dev/dax - so the virtual address from memremap was not needed. 
Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d1d08-dd372cb9-5934-43b8-bef8-089660d04a81-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/dax/fsdev.c') diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index 8b5c6976ad17..c75478d3d548 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -121,6 +121,7 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) struct device *dev = &dev_dax->dev; struct dev_pagemap *pgmap; struct inode *inode; + u64 data_offset = 0; struct cdev *cdev; void *addr; int rc, i; @@ -196,7 +197,6 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) if (pgmap->range.start != dev_dax->ranges[0].range.start) { u64 phys = dev_dax->ranges[0].range.start; u64 pgmap_phys = dev_dax->pgmap[0].range.start; - u64 data_offset = 0; if (!WARN_ON(pgmap_phys > phys)) data_offset = phys - pgmap_phys; @@ -204,6 +204,7 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", __func__, phys, pgmap_phys, data_offset); } + dev_dax->virt_addr = addr + data_offset; inode = dax_inode(dax_dev); cdev = inode->i_cdev; -- cgit v1.2.3 From 099c81a1f0ab3e948d73c5ab2b7a3b702af36e64 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:04:54 +0000 Subject: dax: Add dax_operations for use by fs-dax on fsdev dax fsdev: Add dax_operations for use by famfs. This replicates the functionality from drivers/nvdimm/pmem.c that conventional fs-dax file systems (e.g. xfs) use to support dax read/write/mmap to a daxdev - without which famfs can't sit atop a daxdev. - These methods are based on pmem_dax_ops from drivers/nvdimm/pmem.c - fsdev_dax_direct_access() returns the hpa, pfn and kva. The kva was newly stored as dev_dax->virt_addr by dev_dax_probe(). 
- The hpa/pfn are used for mmap (dax_iomap_fault()), and the kva is used for read/write (dax_iomap_rw()) - fsdev_dax_recovery_write() and fsdev_dax_zero_page_range() have not been tested yet. I'm looking for suggestions as to how to test those. - dax-private.h: add dev_dax->cached_size, which fsdev needs to remember. The dev_dax size cannot change while a driver is bound (dev_dax_resize returns -EBUSY if dev->driver is set). Caching the size at probe time allows fsdev's direct_access path to use it without acquiring dax_dev_rwsem (which isn't exported anyway). Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d415a-bd6af0fe-5445-484c-9d39-210b8170b686-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'drivers/dax/fsdev.c') diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index c75478d3d548..30f57c74c979 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -28,6 +28,85 @@ * - No mmap support - all access is through fs-dax/iomap */ +static void fsdev_write_dax(void *addr, struct page *page, + unsigned int off, unsigned int len) +{ + while (len) { + void *mem = kmap_local_page(page); + unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off); + + memcpy_flushcache(addr, mem + off, chunk); + kunmap_local(mem); + len -= chunk; + off = 0; + page++; + addr += chunk; + } +} + +static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + unsigned long *pfn) +{ + struct dev_dax *dev_dax = dax_get_private(dax_dev); + size_t size = nr_pages << PAGE_SHIFT; + size_t offset = pgoff << PAGE_SHIFT; + void *virt_addr = dev_dax->virt_addr + offset; + phys_addr_t phys; + unsigned long local_pfn; + + phys = dax_pgoff_to_phys(dev_dax, pgoff, size); + if (phys == -1) { + dev_dbg(&dev_dax->dev, + "pgoff (%#lx) out of range\n", pgoff); + return 
-EFAULT; + } + + if (kaddr) + *kaddr = virt_addr; + + local_pfn = PHYS_PFN(phys); + if (pfn) + *pfn = local_pfn; + + /* + * Use cached_size which was computed at probe time. The size cannot + * change while the driver is bound (resize returns -EBUSY). + */ + return PHYS_PFN(min(size, dev_dax->cached_size - offset)); +} + +static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + void *kaddr; + + WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); + __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); + return 0; +} + +static long fsdev_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, unsigned long *pfn) +{ + return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode, + kaddr, pfn); +} + +static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return _copy_from_iter_flushcache(addr, bytes, i); +} + +static const struct dax_operations dev_dax_ops = { + .direct_access = fsdev_dax_direct_access, + .zero_page_range = fsdev_dax_zero_page_range, + .recovery_write = fsdev_dax_recovery_write, +}; + static void fsdev_cdev_del(void *cdev) { cdev_del(cdev); @@ -167,6 +246,11 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) } } + /* Cache size now; it cannot change while driver is bound */ + dev_dax->cached_size = 0; + for (i = 0; i < dev_dax->nr_range; i++) + dev_dax->cached_size += range_len(&dev_dax->ranges[i].range); + /* * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving * folios at order-0. 
Unlike device.c (MEMORY_DEVICE_GENERIC), this -- cgit v1.2.3 From 700ecbc1f5aa02ba9ad68d7be1ef7a9c8eae07e9 Mon Sep 17 00:00:00 2001 From: John Groves Date: Fri, 27 Mar 2026 21:05:03 +0000 Subject: dax: Add dax_set_ops() for setting dax_operations at bind time Add a new dax_set_ops() function that allows drivers to set the dax_operations after the dax_device has been allocated. This is needed for fsdev_dax where the operations need to be set during probe and cleared during unbind. The fsdev driver uses devm_add_action_or_reset() for cleanup consistency, avoiding the complexity of mixing devm-managed resources with manual cleanup in a remove() callback. This ensures cleanup happens automatically in the correct reverse order when the device is unbound. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: John Groves Link: https://patch.msgid.link/0100019d311d65a0-b9c1419e-f3a0-4afd-b0bd-848f18ff5950-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers/dax/fsdev.c') diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index 30f57c74c979..4499d9621f33 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -117,6 +117,13 @@ static void fsdev_kill(void *dev_dax) kill_dev_dax(dev_dax); } +static void fsdev_clear_ops(void *data) +{ + struct dev_dax *dev_dax = data; + + dax_set_ops(dev_dax->dax_dev, NULL); +} + /* * Page map operations for FS-DAX mode * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c @@ -303,6 +310,15 @@ static int fsdev_dax_probe(struct dev_dax *dev_dax) if (rc) return rc; + /* Set the dax operations for fs-dax access path */ + rc = dax_set_ops(dax_dev, &dev_dax_ops); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax); + if (rc) + return rc; + run_dax(dax_dev); return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); } -- cgit v1.2.3 From 45df9111692c62d5f09fc4345ae36dae31024797 Mon Sep 17 
00:00:00 2001 From: John Groves Date: Sun, 12 Apr 2026 15:50:06 +0000 Subject: dax/fsdev: fix uninitialized kaddr in fsdev_dax_zero_page_range() __fsdev_dax_direct_access() returns -EFAULT without setting *kaddr when dax_pgoff_to_phys() returns -1 (pgoff out of range). The return value was ignored, leaving kaddr uninitialized before being passed to fsdev_write_dax(). Check the return value and propagate the error. Thanks to Dan Carpenter and the smatch project for reporting this. Signed-off-by: John Groves Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/0100019d8262cda2-9714d31c-8fc1-4ca5-b32d-4df678240d14-000000@email.amazonses.com Signed-off-by: Ira Weiny --- drivers/dax/fsdev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/dax/fsdev.c') diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c index 4499d9621f33..188b2526bee4 100644 --- a/drivers/dax/fsdev.c +++ b/drivers/dax/fsdev.c @@ -80,9 +80,12 @@ static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages) { void *kaddr; + long rc; WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); - __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + rc = __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); + if (rc < 0) + return rc; fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); return 0; } -- cgit v1.2.3