From 1888635532fbbd6be4a4368621085c3a197279f8 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 16:53:15 +0800 Subject: [PATCH 01/16] writeback: Wake up waiting tasks when finishing the writeback of a chunk. Writing back a large number of pages can take a lots of time. This issue is exacerbated when the underlying device is slow or subject to block layer rate limiting, which in turn triggers unexpected hung task warnings. We can trigger a wake-up once a chunk has been written back and the waiting time for writeback exceeds half of sysctl_hung_task_timeout_secs. This action allows the hung task detector to be aware of the writeback progress, thereby eliminating these unexpected hung task warnings. This patch has passed the xfstests 'check -g quick' test based on ext4, with no additional failures introduced. Signed-off-by: Julian Sun Reviewed-by: Jan Kara Suggested-by: Peter Zijlstra Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 10 +++++++++- include/linux/backing-dev-defs.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037fe..61a980a06cee 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include #include #include #include @@ -213,7 +214,8 @@ static void wb_queue_work(struct bdi_writeback *wb, void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, + ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); })); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -2014,6 +2016,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress. */ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..c8aa749790b1 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -63,6 +63,7 @@ enum wb_reason { struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; + unsigned long progress_stamp; /* The jiffies when slow progress is detected */ }; #define __WB_COMPLETION_INIT(_waitq) \ From d6e6215907640801b1f407dc9e871b19ca5a3805 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 15:18:29 +0800 Subject: [PATCH 02/16] writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs) When a writeback work lasts for sysctl_hung_task_timeout_secs, we want to identify that there are tasks waiting for a long time-this helps us pinpoint potential issues. Additionally, recording the starting jiffies is useful when debugging a crashed vmcore. Signed-off-by: Julian Sun Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 17 +++++++++++++++-- include/linux/backing-dev-defs.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 61a980a06cee..e76192d140e3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -201,6 +201,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " + "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -213,9 +226,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, - ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); })); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c8aa749790b1..610ef62b6a32 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -64,6 +64,7 @@ struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; unsigned long progress_stamp; /* The jiffies when slow progress is detected */ + unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ From a21134b5d6cb5e9e11fd170c48eeba6f82f1b2a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:12 +0200 Subject: [PATCH 03/16] mm: don't opencode filemap_fdatawrite_range in filemap_invalidate_inode Use filemap_fdatawrite_range instead of opencoding the logic using filemap_fdatawrite_wbc. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-2-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- mm/filemap.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..99d6919af60d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4457,16 +4457,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ - if (flush) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - filemap_fdatawrite_wbc(mapping, &wbc); - } + if (flush) + filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); From 3c2e5cee5eb3e691ff802b15936844739252b016 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:13 +0200 Subject: [PATCH 04/16] 9p: don't opencode filemap_fdatawrite_range in v9fs_mmap_vm_close Use filemap_fdatawrite_range instead of opencoding the logic using filemap_fdatawrite_wbc. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/9p/vfs_file.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index eb0b083da269..612a230bc012 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { - struct inode *inode; - - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_ALL, - .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, - /* absolute end, byte at end included */ - .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + - (vma->vm_end - vma->vm_start - 1), - }; - if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); - inode = file_inode(vma->vm_file); - filemap_fdatawrite_wbc(inode->i_mapping, &wbc); + filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping, + (loff_t)vma->vm_pgoff * PAGE_SIZE, + (loff_t)vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1)); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { From 890f141da068ab38d6dd52e6d109c1f488e980d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:14 +0200 Subject: [PATCH 05/16] ocfs2: don't opencode filemap_fdatawrite_range in ocfs2_journal_submit_inode_data_buffers Use filemap_fdatawrite_range instead of opencoding the logic using filemap_fdatawrite_wbc. There is a slight change in the conversion as nr_to_write is now set to LONG_MAX instead of double the number of the pages in the range. LONG_MAX is the usual nr_to_write for WB_SYNC_ALL writeback, and the value expected by lower layers here. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-4-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Joseph Qi Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/ocfs2/journal.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5f58ff2175f..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -902,15 +902,8 @@ int ocfs2_journal_alloc(struct ocfs2_super *osb) static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) From 41e52c644753f1faac8892df32709cc659be01fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:15 +0200 Subject: [PATCH 06/16] btrfs: use the local tmp_inode variable in start_delalloc_inodes start_delalloc_inodes has a struct inode * pointer available in the main loop, use it instead of re-calculating it from the btrfs inode. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-5-hch@lst.de Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..9edb78fc57fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8744,9 +8744,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(&inode->vfs_inode); + work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { - iput(&inode->vfs_inode); + iput(tmp_inode); ret = -ENOMEM; goto out; } @@ -8754,7 +8754,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + ret = filemap_fdatawrite_wbc(tmp_inode->i_mapping, wbc); btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; From c9501112e3cbb7bd9941024f005c5d0f0b77c9f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:16 +0200 Subject: [PATCH 07/16] btrfs: push struct writeback_control into start_delalloc_inodes In preparation for changing the filemap_fdatawrite_wbc API to not expose the writeback_control to the callers, push the wbc declaration next to the filemap_fdatawrite_wbc call and just pass the nr_to_write value to start_delalloc_inodes. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-6-hch@lst.de Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/btrfs/inode.c | 51 ++++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9edb78fc57fc..b97d6c1f7772 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8709,15 +8709,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, - struct writeback_control *wbc, bool snapshot, - bool in_reclaim_context) +static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, + bool snapshot, bool in_reclaim_context) { struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; - bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); @@ -8743,7 +8741,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); - if (full_flush) { + if (nr_to_write == NULL) { work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { iput(tmp_inode); @@ -8754,9 +8752,20 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(tmp_inode->i_mapping, wbc); + struct writeback_control wbc = { + .nr_to_write = *nr_to_write, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + ret = filemap_fdatawrite_wbc(tmp_inode->i_mapping, + &wbc); btrfs_add_delayed_iput(inode); - if (ret || wbc->nr_to_write <= 0) + + if (*nr_to_write != LONG_MAX) + *nr_to_write = wbc.nr_to_write; + if (ret || *nr_to_write <= 0) goto out; } cond_resched(); @@ -8782,29 +8791,17 @@ static int start_delalloc_inodes(struct btrfs_root *root, int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - - return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = nr, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + long *nr_to_write = nr == LONG_MAX ? NULL : &nr; struct btrfs_root *root; LIST_HEAD(splice); int ret; @@ -8816,13 +8813,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { - /* - * Reset nr_to_write here so we know that we're doing a full - * flush. - */ - if (nr == LONG_MAX) - wbc.nr_to_write = LONG_MAX; - root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); @@ -8831,9 +8821,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + ret = start_delalloc_inodes(root, nr_to_write, false, + in_reclaim_context); btrfs_put_root(root); - if (ret < 0 || wbc.nr_to_write <= 0) + if (ret < 0 || nr <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } From 7fabcb7fbabbcddd9dc42dbe4c92d18ce3e54283 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:17 +0200 Subject: [PATCH 08/16] mm,btrfs: add a filemap_flush_nr helper Abstract out the btrfs-specific behavior of kicking off I/O on a number of pages on an address_space into a well-defined helper. Note: there is no kerneldoc comment for the new function because it is not part of the public API. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-7-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/inode.c | 13 ++----------- include/linux/pagemap.h | 1 + mm/filemap.c | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b97d6c1f7772..d12b8116adde 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8752,19 +8752,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - struct writeback_control wbc = { - .nr_to_write = *nr_to_write, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; - - ret = filemap_fdatawrite_wbc(tmp_inode->i_mapping, - &wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (*nr_to_write != LONG_MAX) - *nr_to_write = wbc.nr_to_write; if (ret || *nr_to_write <= 0) goto out; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..cebdf160d3dd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping, int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 99d6919af60d..e344b79a012d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -474,6 +474,28 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/* + * Start writeback on @nr_to_write pages from @mapping. No one but the existing + * btrfs caller should be using this. Talk to linux-mm if you think adding a + * new caller is a good idea. + */ +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) +{ + struct writeback_control wbc = { + .nr_to_write = *nr_to_write, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; + int ret; + + ret = filemap_fdatawrite_wbc(mapping, &wbc); + if (!ret) + *nr_to_write = wbc.nr_to_write; + return ret; +} +EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); + /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check From 7359651448062ef07d5190e11609722a59a90998 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:18 +0200 Subject: [PATCH 09/16] mm: remove __filemap_fdatawrite And rewrite filemap_fdatawrite to use filemap_fdatawrite_range instead to have a simpler call chain. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-8-hch@lst.de Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- mm/filemap.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e344b79a012d..3d4c4a96c586 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -422,25 +422,19 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, return filemap_fdatawrite_wbc(mapping, &wbc); } -static inline int __filemap_fdatawrite(struct address_space *mapping, - int sync_mode) -{ - return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); -} - -int filemap_fdatawrite(struct address_space *mapping) -{ - return __filemap_fdatawrite(mapping, WB_SYNC_ALL); -} -EXPORT_SYMBOL(filemap_fdatawrite); - int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end) + loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite_range); +int filemap_fdatawrite(struct address_space *mapping) +{ + return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); +} +EXPORT_SYMBOL(filemap_fdatawrite); + /** * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space @@ -470,7 +464,7 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); */ int filemap_flush(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_NONE); + return filemap_fdatawrite_range_kick(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); From 1bcb413d0cd80efb386751910036a93147fd8dbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:19 +0200 Subject: [PATCH 10/16] mm: remove filemap_fdatawrite_wbc Replace filemap_fdatawrite_wbc, which exposes a writeback_control to the callers with a filemap_writeback helper that takes all the possible arguments and declares the writeback_control itself. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-9-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 6 ++--- include/linux/pagemap.h | 2 -- mm/filemap.c | 54 ++++++++++++++--------------------------- 3 files changed, 21 insertions(+), 41 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e76192d140e3..4448de35ec8b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -822,9 +822,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. + * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cebdf160d3dd..678d8ae23d01 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -60,8 +60,6 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) diff --git a/mm/filemap.c b/mm/filemap.c index 3d4c4a96c586..7126d0587c94 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -366,31 +366,30 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. - */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range @@ -412,14 +411,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_wbc); int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_writeback(mapping, start, end, sync_mode, NULL); } int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, @@ -475,18 +467,8 @@ EXPORT_SYMBOL(filemap_flush); */ int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) { - struct writeback_control wbc = { - .nr_to_write = *nr_to_write, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; - int ret; - - ret = filemap_fdatawrite_wbc(mapping, &wbc); - if (!ret) - *nr_to_write = wbc.nr_to_write; - return ret; + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); } EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); From 45cbce5b8877f339b72548f60aa97634044c255c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:20 +0200 Subject: [PATCH 11/16] mm: remove __filemap_fdatawrite_range Use filemap_fdatawrite_range and filemap_fdatawrite_range_kick instead of the low-level __filemap_fdatawrite_range that requires the caller to know the internals of the writeback_control structure and remove __filemap_fdatawrite_range now that it is trivial and only two callers would be left. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-10-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/sync.c | 11 +++++------ include/linux/pagemap.h | 2 -- mm/fadvise.c | 3 +-- mm/filemap.c | 25 +++++++------------------ 4 files changed, 13 insertions(+), 28 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 2955cd4c77a3..6d8b04e04c3c 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -280,14 +280,13 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - int sync_mode = WB_SYNC_NONE; - if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) - sync_mode = WB_SYNC_ALL; - - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - sync_mode); + ret = filemap_fdatawrite_range(mapping, offset, + endbyte); + else + ret = filemap_fdatawrite_range_kick(mapping, offset, + endbyte); if (ret < 0) goto out; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 678d8ae23d01..d0a7dd43c835 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -54,8 +54,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); -int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); diff --git a/mm/fadvise.c b/mm/fadvise.c index 588fe76c5a14..f1be619f0e58 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + filemap_fdatawrite_range_kick(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index 7126d0587c94..f90f5bb2b825 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -392,32 +392,23 @@ static int filemap_writeback(struct address_space *mapping, loff_t start, } /** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets inclusive. * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. + * This is a data integrity operation that waits upon dirty or in writeback + * pages. * * Return: %0 on success, negative error code otherwise. */ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - return filemap_writeback(mapping, start, end, sync_mode, NULL); -} - int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); + return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } EXPORT_SYMBOL(filemap_fdatawrite_range); @@ -441,7 +432,7 @@ EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); + return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); @@ -689,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. @@ -792,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); From c28d67b33cbf6da2043ee7517f1aa4cbf92dbbba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:21 +0200 Subject: [PATCH 12/16] mm: rename filemap_fdatawrite_range_kick to filemap_flush_range Rename filemap_fdatawrite_range_kick to filemap_flush_range because it is the ranged version of filemap_flush. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-11-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/sync.c | 3 +-- include/linux/fs.h | 6 +++--- mm/fadvise.c | 2 +- mm/filemap.c | 8 ++++---- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 6d8b04e04c3c..1759f6ba36cd 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -285,8 +285,7 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, ret = filemap_fdatawrite_range(mapping, offset, endbyte); else - ret = filemap_fdatawrite_range_kick(mapping, offset, - endbyte); + ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..a5dbfa20f8d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3014,7 +3014,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) @@ -3051,8 +3051,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_flush_range(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; diff --git a/mm/fadvise.c b/mm/fadvise.c index f1be619f0e58..67028e30aa91 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,7 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - filemap_fdatawrite_range_kick(mapping, offset, endbyte); + filemap_flush_range(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index f90f5bb2b825..fa770768ea3a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -419,7 +419,7 @@ int filemap_fdatawrite(struct address_space *mapping) EXPORT_SYMBOL(filemap_fdatawrite); /** - * filemap_fdatawrite_range_kick - start writeback on a range + * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback @@ -429,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite); * * Return: %0 on success, negative error code otherwise. */ -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } -EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); +EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush @@ -447,7 +447,7 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); */ int filemap_flush(struct address_space *mapping) { - return filemap_fdatawrite_range_kick(mapping, 0, LLONG_MAX); + return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); From 151d0922bf638a4e4235758d04b31f48bfcbb798 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:47 +0200 Subject: [PATCH 13/16] writeback: cleanup writeback_chunk_size Return the pages directly when calculated instead of first assigning them back to a variable, and directly return for the data integrity / tagged case instead of going through an else clause. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-2-hch@lst.de Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Reviewed-by: Nirjhar Roy (IBM) Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4448de35ec8b..30de37865fa1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1908,16 +1908,12 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) - pages = LONG_MAX; - else { - pages = min(wb->avg_write_bandwidth / 2, - global_wb_domain.dirty_limit / DIRTY_SCOPE); - pages = min(pages, work->nr_pages); - pages = round_down(pages + MIN_WRITEBACK_PAGES, - MIN_WRITEBACK_PAGES); - } + return LONG_MAX; - return pages; + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); } /* From 90db4d4441f58d433ecf74f7e3bd17e0a553c20c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:48 +0200 Subject: [PATCH 14/16] writeback: allow the file system to override MIN_WRITEBACK_PAGES The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 14 +++++--------- fs/super.c | 1 + include/linux/fs.h | 1 + include/linux/writeback.h | 5 +++++ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 30de37865fa1..52763fa499d6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -32,11 +32,6 @@ #include #include "internal.h" -/* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -1889,8 +1884,8 @@ static int writeback_single_inode(struct inode *inode, return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1913,7 +1908,8 @@ static long writeback_chunk_size(struct bdi_writeback *wb, pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); - return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -2015,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb, inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..599c1d2641fe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: diff --git a/include/linux/fs.h b/include/linux/fs.h index a5dbfa20f8d7..6bf369095d2e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1583,6 +1583,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ + long s_min_writeback_pages; } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..49e1dd96f43e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ From 015a54407782309c11655e1938bcbcef131ebf87 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:49 +0200 Subject: [PATCH 15/16] xfs: set s_min_writeback_pages for zoned file systems Set s_min_writeback_pages to the zone size, so that writeback always writes up to a full zone. This ensures that writeback does not add spurious file fragmentation when writing back a large number of files that are larger than the zone size. Fixes: 4e4d52075577 ("xfs: add the zoned space allocator") Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-4-hch@lst.de Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_zone_alloc.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 1147bacb2da8..c342595acc3e 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1215,6 +1215,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1245,10 +1246,33 @@ xfs_mount_zones( return -ENOMEM; xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. The default lower bound is 4MiB, but for zoned file + * systems we want to increase that both to reduce seeks, but also more + * importantly so that workloads that writes files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase is to the zone size capped by the max + * extent len. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also get applied to non-zoned files on the data device if + * there are any. On typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. + */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { error = blkdev_report_zones(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), From 4952f35f0545f3b53dab8d5fd727c4827c2a2778 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Mon, 29 Sep 2025 19:13:49 +0800 Subject: [PATCH 16/16] fs: Make wbc_to_tag() inline and use it in fs. The logic in wbc_to_tag() is widely used in file systems, so modify this function to be inline and use it in file systems. This patch has only passed compilation tests, but it should be fine. Signed-off-by: Julian Sun Reviewed-by: Qu Wenruo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/extent_io.c | 5 +---- fs/ceph/addr.c | 6 +----- fs/ext4/inode.c | 5 +---- fs/f2fs/data.c | 5 +---- fs/gfs2/aops.c | 5 +---- include/linux/writeback.h | 7 +++++++ mm/page-writeback.c | 6 ------ 7 files changed, 12 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c123a3ef154a..170dd7e80d11 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2460,10 +2460,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9e4ac87211e..58d6194045e2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2619,10 +2619,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ef38e62cda8f..826bcfb8230c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..12394fc5dd29 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -311,10 +311,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 49e1dd96f43e..2a81816f7507 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -196,6 +196,13 @@ static inline void wait_on_inode(struct inode *inode) !(READ_ONCE(inode->i_state) & I_NEW)); } +static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; +} + #ifdef CONFIG_CGROUP_WRITEBACK #include diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..a124ab6a205d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } -static xa_mark_t wbc_to_tag(struct writeback_control *wbc) -{ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - return PAGECACHE_TAG_TOWRITE; - return PAGECACHE_TAG_DIRTY; -} static pgoff_t wbc_end(struct writeback_control *wbc) {