From a0f1cabe294c914ef58a414dce0de5c46a767bb5 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:50 -0800 Subject: [PATCH 1/8] iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted The naming "bytes_pending" and "bytes_accounted" may be confusing and could be better named. Rename this to "bytes_submitted" and "bytes_not_submitted" to make it more clear that these are bytes we passed to the IO helper to read in. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-2-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6ae031ac8058..7dcb8bbc9484 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -394,16 +394,16 @@ static void iomap_read_init(struct folio *folio) * Else the IO helper will end the read after all submitted ranges have been * read. */ -static void iomap_read_end(struct folio *folio, size_t bytes_pending) +static void iomap_read_end(struct folio *folio, size_t bytes_submitted) { struct iomap_folio_state *ifs; /* - * If there are no bytes pending, this means we are responsible for + * If there are no bytes submitted, this means we are responsible for * unlocking the folio here, since no IO helper has taken ownership of * it. */ - if (!bytes_pending) { + if (!bytes_submitted) { folio_unlock(folio); return; } @@ -416,11 +416,11 @@ static void iomap_read_end(struct folio *folio, size_t bytes_pending) * read_bytes_pending but skipped for IO. * The +1 accounts for the bias we added in iomap_read_init(). 
*/ - size_t bytes_accounted = folio_size(folio) + 1 - - bytes_pending; + size_t bytes_not_submitted = folio_size(folio) + 1 - + bytes_submitted; spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending -= bytes_accounted; + ifs->read_bytes_pending -= bytes_not_submitted; /* * If !ifs->read_bytes_pending, this means all pending reads * by the IO helper have already completed, which means we need @@ -437,7 +437,7 @@ static void iomap_read_end(struct folio *folio, size_t bytes_pending) } static int iomap_read_folio_iter(struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx, size_t *bytes_pending) + struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted) { const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos; @@ -478,9 +478,9 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, folio_zero_range(folio, poff, plen); iomap_set_range_uptodate(folio, poff, plen); } else { - if (!*bytes_pending) + if (!*bytes_submitted) iomap_read_init(folio); - *bytes_pending += plen; + *bytes_submitted += plen; ret = ctx->ops->read_folio_range(iter, ctx, plen); if (ret) return ret; @@ -504,39 +504,40 @@ void iomap_read_folio(const struct iomap_ops *ops, .pos = folio_pos(folio), .len = folio_size(folio), }; - size_t bytes_pending = 0; + size_t bytes_submitted = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, ctx, &bytes_pending); + iter.status = iomap_read_folio_iter(&iter, ctx, + &bytes_submitted); if (ctx->ops->submit_read) ctx->ops->submit_read(ctx); - iomap_read_end(folio, bytes_pending); + iomap_read_end(folio, bytes_submitted); } EXPORT_SYMBOL_GPL(iomap_read_folio); static int iomap_readahead_iter(struct iomap_iter *iter, - struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_pending) + struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted) { int ret; while (iomap_length(iter)) { if (ctx->cur_folio && offset_in_folio(ctx->cur_folio, iter->pos) == 0) { - iomap_read_end(ctx->cur_folio, *cur_bytes_pending); + iomap_read_end(ctx->cur_folio, *cur_bytes_submitted); ctx->cur_folio = NULL; } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); if (WARN_ON_ONCE(!ctx->cur_folio)) return -EINVAL; - *cur_bytes_pending = 0; + *cur_bytes_submitted = 0; } - ret = iomap_read_folio_iter(iter, ctx, cur_bytes_pending); + ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted); if (ret) return ret; } @@ -568,19 +569,19 @@ void iomap_readahead(const struct iomap_ops *ops, .pos = readahead_pos(rac), .len = readahead_length(rac), }; - size_t cur_bytes_pending; + size_t cur_bytes_submitted; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) iter.status = iomap_readahead_iter(&iter, ctx, - &cur_bytes_pending); + &cur_bytes_submitted); if (ctx->ops->submit_read) ctx->ops->submit_read(ctx); if (ctx->cur_folio) - iomap_read_end(ctx->cur_folio, cur_bytes_pending); + iomap_read_end(ctx->cur_folio, cur_bytes_submitted); } EXPORT_SYMBOL_GPL(iomap_readahead); From 9d875e0eef8ec15b6b1da0cb9a0f8ed13efee89e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:51 -0800 Subject: [PATCH 2/8] iomap: account for unaligned end offsets when truncating read range The end position to start truncating from may be at an offset into a block, which under the current logic would result in overtruncation. Adjust the calculation to account for unaligned end offsets. 
Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-3-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7dcb8bbc9484..0eb439b523b1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -218,6 +218,22 @@ static void ifs_free(struct folio *folio) kfree(ifs); } +/* + * Calculate how many bytes to truncate based off the number of blocks to + * truncate and the end position to start truncating from. + */ +static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits, + unsigned blocks_truncated) +{ + unsigned block_size = 1 << block_bits; + unsigned block_offset = end_pos & (block_size - 1); + + if (!block_offset) + return blocks_truncated << block_bits; + + return ((blocks_truncated - 1) << block_bits) + block_offset; +} + /* * Calculate the range inside the folio that we actually need to read. */ @@ -263,7 +279,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, /* truncate len if we find any trailing uptodate block(s) */ while (++i <= last) { if (ifs_block_is_uptodate(ifs, i)) { - plen -= (last - i + 1) * block_size; + plen -= iomap_bytes_to_truncate(*pos + plen, + block_bits, last - i + 1); last = i - 1; break; } @@ -279,7 +296,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) - plen -= (last - end) * block_size; + plen -= iomap_bytes_to_truncate(*pos + plen, block_bits, + last - end); } *offp = poff; From 7e6cea5ae2f5e62112fce69acc07ee8b694b6dd0 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:52 -0800 Subject: [PATCH 3/8] docs: document iomap writeback's iomap_finish_folio_write() requirement Document that iomap_finish_folio_write() must be called after writeback on the range completes. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 3 +++ include/linux/iomap.h | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index c88205132039..4d30723be7fa 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -361,6 +361,9 @@ The fields are as follows: delalloc reservations to avoid having delalloc reservations for clean pagecache. This function must be supplied by the filesystem. + If this succeeds, iomap_finish_folio_write() must be called once writeback + completes for the range, regardless of whether the writeback succeeded or + failed. - ``writeback_submit``: Submit the previous built writeback context. Block based file systems should use the iomap_ioend_writeback_submit diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b1ac08c7474..a5032e456079 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -435,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. 
* + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, From 6b1fd2281fb0873ec56f8791d4e4898302070804 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:53 -0800 Subject: [PATCH 4/8] iomap: optimize pending async writeback accounting Pending writebacks must be accounted for to determine when all requests have completed and writeback on the folio should be ended. Currently this is done by atomically incrementing ifs->write_bytes_pending for every range to be written back. Instead, the number of atomic operations can be minimized by setting ifs->write_bytes_pending to the folio size, internally tracking how many bytes are written back asynchronously, and then after sending off all the requests, decrementing ifs->write_bytes_pending by the number of bytes not written back asynchronously. Now, for N ranges written back, only N + 2 atomic operations are required instead of 2N + 2. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/fuse/file.c | 4 +-- fs/iomap/buffered-io.c | 58 +++++++++++++++++++++++++----------------- fs/iomap/ioend.c | 2 -- include/linux/iomap.h | 2 -- 4 files changed, 36 insertions(+), 30 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8275b6681b9b..b343a6f37563 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1885,7 +1885,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ - iomap_finish_folio_write(inode, ap->folios[i], 1); + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -2221,7 +2222,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, ap = &wpa->ia.ap; } - iomap_start_folio_write(inode, folio, 1); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0eb439b523b1..1873a2f74883 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1641,16 +1641,25 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len) +static void iomap_writeback_init(struct inode *inode, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + if (ifs) { + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set this to the folio size. After processing the folio for + * writeback in iomap_writeback_folio(), we'll subtract any + * ranges not written back. + * + * We do this because otherwise, we would have to atomically + * increment ifs->write_bytes_pending every time a range in the + * folio needs to be written back. 
+ */ + atomic_set(&ifs->write_bytes_pending, folio_size(folio)); + } } -EXPORT_SYMBOL_GPL(iomap_start_folio_write); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) @@ -1667,7 +1676,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write); static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, u32 rlen, u64 end_pos, - bool *wb_pending) + size_t *bytes_submitted) { do { ssize_t ret; @@ -1681,11 +1690,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, pos += ret; /* - * Holes are not be written back by ->writeback_range, so track + * Holes are not written back by ->writeback_range, so track * if we did handle anything that is not a hole here. */ if (wpc->iomap.type != IOMAP_HOLE) - *wb_pending = true; + *bytes_submitted += ret; } while (rlen); return 0; @@ -1756,7 +1765,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - bool wb_pending = false; + size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1776,14 +1785,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) iomap_set_range_dirty(folio, 0, end_pos - pos); } - /* - * Keep the I/O completion handler from clearing the writeback - * bit until we have submitted all blocks by adding a bias to - * ifs->write_bytes_pending, which is dropped after submitting - * all blocks. - */ - WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - iomap_start_folio_write(inode, folio, 1); + iomap_writeback_init(inode, folio); } /* @@ -1798,13 +1800,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, - &wb_pending); + &bytes_submitted); if (error) break; pos += rlen; } - if (wb_pending) + if (bytes_submitted) wpc->nr_folios++; /* @@ -1822,12 +1824,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); - } else { - if (!wb_pending) - folio_end_writeback(folio); + /* + * Subtract any bytes that were initially accounted to + * write_bytes_pending but skipped for writeback. + */ + size_t bytes_not_submitted = folio_size(folio) - + bytes_submitted; + + if (bytes_not_submitted) + iomap_finish_folio_write(inode, folio, + bytes_not_submitted); + } else if (!bytes_submitted) { + folio_end_writeback(folio); } + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..86f44922ed3b 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -194,8 +194,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; - iomap_start_folio_write(wpc->inode, folio, map_len); - /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a5032e456079..b49e47f069db 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -478,8 +478,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); From f8eaf79406fe9415db0e7a5c175b50cb01265199 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:54 -0800 Subject: [PATCH 5/8] iomap: simplify ->read_folio_range() error handling for reads Instead of requiring that the caller calls iomap_finish_folio_read() even if the ->read_folio_range() callback returns an error, account for this internally in iomap instead, which makes the interface simpler and makes it match writeback's ->read_folio_range() error handling expectations. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- .../filesystems/iomap/operations.rst | 7 +-- fs/fuse/file.c | 10 +-- fs/iomap/buffered-io.c | 63 ++++++++++--------- include/linux/iomap.h | 5 +- 4 files changed, 41 insertions(+), 44 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 4d30723be7fa..64f4baf5750e 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -149,10 +149,9 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: iomap calls these functions: - ``read_folio_range``: Called to read in the range. This must be provided - by the caller. The caller is responsible for calling - iomap_finish_folio_read() after reading in the folio range. This should be - done even if an error is encountered during the read. This returns 0 on - success or a negative error on failure. + by the caller. If this succeeds, iomap_finish_folio_read() must be called + after the range is read in, regardless of whether the read succeeded or + failed. - ``submit_read``: Submit any pending read requests. This function is optional. diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b343a6f37563..7bcb650a9f26 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -922,13 +922,6 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, if (ctx->rac) { ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); - /* - * If fuse_handle_readahead was successful, fuse_readpages_end - * will do the iomap_finish_folio_read, else we need to call it - * here - */ - if (ret) - iomap_finish_folio_read(folio, off, len, ret); } else { /* * for non-readahead read requests, do reads synchronously @@ -936,7 +929,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, * out-of-order reads */ ret = fuse_do_readfolio(file, folio, off, len); - iomap_finish_folio_read(folio, off, len, ret); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); } return ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1873a2f74883..c82b5b24d4b3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -398,7 +398,8 @@ static void iomap_read_init(struct folio *folio) * has already finished reading in the entire folio. 
*/ spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += len + 1; + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + ifs->read_bytes_pending = len + 1; spin_unlock_irq(&ifs->state_lock); } } @@ -414,43 +415,47 @@ static void iomap_read_init(struct folio *folio) */ static void iomap_read_end(struct folio *folio, size_t bytes_submitted) { - struct iomap_folio_state *ifs; + struct iomap_folio_state *ifs = folio->private; - /* - * If there are no bytes submitted, this means we are responsible for - * unlocking the folio here, since no IO helper has taken ownership of - * it. - */ - if (!bytes_submitted) { - folio_unlock(folio); - return; - } - - ifs = folio->private; if (ifs) { bool end_read, uptodate; - /* - * Subtract any bytes that were initially accounted to - * read_bytes_pending but skipped for IO. - * The +1 accounts for the bias we added in iomap_read_init(). - */ - size_t bytes_not_submitted = folio_size(folio) + 1 - - bytes_submitted; spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending -= bytes_not_submitted; - /* - * If !ifs->read_bytes_pending, this means all pending reads - * by the IO helper have already completed, which means we need - * to end the folio read here. If ifs->read_bytes_pending != 0, - * the IO helper will end the folio read. - */ - end_read = !ifs->read_bytes_pending; + if (!ifs->read_bytes_pending) { + WARN_ON_ONCE(bytes_submitted); + end_read = true; + } else { + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. The +1 + * accounts for the bias we added in iomap_read_init(). + */ + size_t bytes_not_submitted = folio_size(folio) + 1 - + bytes_submitted; + ifs->read_bytes_pending -= bytes_not_submitted; + /* + * If !ifs->read_bytes_pending, this means all pending + * reads by the IO helper have already completed, which + * means we need to end the folio read here. If + * ifs->read_bytes_pending != 0, the IO helper will end + * the folio read. + */ + end_read = !ifs->read_bytes_pending; + } if (end_read) uptodate = ifs_is_fully_uptodate(folio, ifs); spin_unlock_irq(&ifs->state_lock); if (end_read) folio_end_read(folio, uptodate); + } else if (!bytes_submitted) { + /* + * If there were no bytes submitted, this means we are + * responsible for unlocking the folio here, since no IO helper + * has taken ownership of it. If there were bytes submitted, + * then the IO helper will end the read via + * iomap_finish_folio_read(). + */ + folio_unlock(folio); } } @@ -498,10 +503,10 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, } else { if (!*bytes_submitted) iomap_read_init(folio); - *bytes_submitted += plen; ret = ctx->ops->read_folio_range(iter, ctx, plen); if (ret) return ret; + *bytes_submitted += plen; } ret = iomap_iter_advance(iter, plen); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b49e47f069db..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -495,9 +495,8 @@ struct iomap_read_ops { /* * Read in a folio range. * - * The caller is responsible for calling iomap_finish_folio_read() after - * reading in the folio range. This should be done even if an error is - * encountered during the read. + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. * * Returns 0 on success or a negative error on failure. 
 */

From a298febc47e0ce116b9fc8151337ba8b2137e42d Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Tue, 11 Nov 2025 11:36:55 -0800
Subject: [PATCH 6/8] iomap: simplify when reads can be skipped for writes

Currently, the logic for skipping the read range for a write is

    if (!(iter->flags & IOMAP_UNSHARE) &&
        (from <= poff || from >= poff + plen) &&
        (to <= poff || to >= poff + plen))

which breaks down to skipping the read if any of these are true:

a) from <= poff && to <= poff
b) from <= poff && to >= poff + plen
c) from >= poff + plen && to <= poff
d) from >= poff + plen && to >= poff + plen

This can be simplified to

    if (!(iter->flags & IOMAP_UNSHARE) && from <= poff && to >= poff + plen)

from the following reasoning:

a) from <= poff && to <= poff

This reduces to 'to <= poff' since it is guaranteed that 'from <= to'
(since to = from + len). It is not possible for 'to <= poff' to be true
here because we only reach here if plen > 0 (thanks to the preceding
'if (plen == 0)' check that would break us out of the loop). If
'to <= poff', plen would have to be 0 since poff and plen get adjusted
in lockstep for uptodate blocks. This means we can eliminate this check.

c) from >= poff + plen && to <= poff

This is not possible since 'from <= to' and 'plen > 0'. We can eliminate
this check.

d) from >= poff + plen && to >= poff + plen

This reduces to 'from >= poff + plen' since 'from <= to'. It is not
possible for 'from >= poff + plen' to be true here. We only reach here
if plen > 0 and for writes, poff and plen will always be block-aligned,
which means poff <= from < poff + plen. We can eliminate this check.

The only valid check is b) from <= poff && to >= poff + plen.

Signed-off-by: Joanne Koong
Link: https://patch.msgid.link/20251111193658.3495942-7-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig
Reviewed-by: Darrick J. Wong
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index c82b5b24d4b3..17449ea13420 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -758,9 +758,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
 		if (plen == 0)
 			break;
 
-		if (!(iter->flags & IOMAP_UNSHARE) &&
-		    (from <= poff || from >= poff + plen) &&
-		    (to <= poff || to >= poff + plen))
+		/*
+		 * If the read range will be entirely overwritten by the write,
+		 * we can skip having to zero/read it in.
+		 */
+		if (!(iter->flags & IOMAP_UNSHARE) && from <= poff &&
+		    to >= poff + plen)
 			continue;
 
 		if (iomap_block_needs_zeroing(iter, block_start)) {

From fed9c62d28b726dad70cc03fd28ffd700b59c741 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Tue, 11 Nov 2025 11:36:57 -0800
Subject: [PATCH 7/8] iomap: use find_next_bit() for dirty bitmap scanning

Use find_next_bit()/find_next_zero_bit() for iomap dirty bitmap
scanning. This uses __ffs() internally and is more efficient for
finding the next dirty or clean bit than iterating through the bitmap
range testing every bit.

Signed-off-by: Joanne Koong
Link: https://patch.msgid.link/20251111193658.3495942-9-joannelkoong@gmail.com
Reviewed-by: Darrick J.
Wong Suggested-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 61 ++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 17449ea13420..3713ced188ab 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -76,13 +76,34 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, folio_mark_uptodate(folio); } -static inline bool ifs_block_is_dirty(struct folio *folio, - struct iomap_folio_state *ifs, int block) +/* + * Find the next dirty block in the folio. end_blk is inclusive. + * If no dirty block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_dirty_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int blks = i_blocks_per_folio(inode, folio); - return test_bit(block + blks_per_folio, ifs->state); + return find_next_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; +} + +/* + * Find the next clean block in the folio. end_blk is inclusive. + * If no clean block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_clean_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + unsigned int blks = i_blocks_per_folio(inode, folio); + + return find_next_zero_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; } static unsigned ifs_find_dirty_range(struct folio *folio, @@ -93,18 +114,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio, offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, - i_blocks_per_folio(inode, folio)); - unsigned nblks = 1; + i_blocks_per_folio(inode, folio)) - 1; + unsigned nblks; - while (!ifs_block_is_dirty(folio, ifs, start_blk)) - if (++start_blk == end_blk) - return 0; - - while (start_blk + nblks < end_blk) { - if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) - break; - nblks++; - } + start_blk = ifs_next_dirty_block(folio, start_blk, end_blk); + if (start_blk > end_blk) + return 0; + if (start_blk == end_blk) + nblks = 1; + else + nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) - + start_blk; *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; @@ -1166,7 +1186,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { - unsigned int first_blk, last_blk, i; + unsigned int first_blk, last_blk; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; @@ -1185,10 +1205,11 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, folio_pos(folio) + folio_size(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; - for (i = first_blk; i <= last_blk; i++) { - if (!ifs_block_is_dirty(folio, ifs, i)) - punch(inode, folio_pos(folio) + (i << blkbits), - 1 << blkbits, iomap); + while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk)) + <= last_blk) { + punch(inode, folio_pos(folio) + (first_blk << blkbits), + 1 << blkbits, iomap); + first_blk++; } } From 
b56c1c54f225ca02d88ec562f017be23429bf5b2 Mon Sep 17 00:00:00 2001
From: Joanne Koong
Date: Tue, 11 Nov 2025 11:36:58 -0800
Subject: [PATCH 8/8] iomap: use find_next_bit() for uptodate bitmap scanning

Use find_next_bit()/find_next_zero_bit() for iomap uptodate bitmap
scanning. This uses __ffs() internally and is more efficient for
finding the next uptodate or non-uptodate bit than iterating through
the bitmap range testing every bit.

Signed-off-by: Joanne Koong
Link: https://patch.msgid.link/20251111193658.3495942-10-joannelkoong@gmail.com
Reviewed-by: Darrick J. Wong
Suggested-by: Christoph Hellwig
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3713ced188ab..089566a36cff 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -38,10 +38,28 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio,
 	return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
 }
 
-static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
-		unsigned int block)
+/*
+ * Find the next uptodate block in the folio. end_blk is inclusive.
+ * If no uptodate block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_uptodate_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
 {
-	return test_bit(block, ifs->state);
+	struct iomap_folio_state *ifs = folio->private;
+
+	return find_next_bit(ifs->state, end_blk + 1, start_blk);
+}
+
+/*
+ * Find the next non-uptodate block in the folio. end_blk is inclusive.
+ * If no non-uptodate block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_nonuptodate_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	return find_next_zero_bit(ifs->state, end_blk + 1, start_blk);
 }
 
 static bool ifs_set_range_uptodate(struct folio *folio,
@@ -277,14 +295,11 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
 	 * to avoid reading in already uptodate ranges.
*/ if (ifs) { - unsigned int i, blocks_skipped; + unsigned int next, blocks_skipped; - /* move forward for each leading block marked uptodate */ - for (i = first; i <= last; i++) - if (!ifs_block_is_uptodate(ifs, i)) - break; + next = ifs_next_nonuptodate_block(folio, first, last); + blocks_skipped = next - first; - blocks_skipped = i - first; if (blocks_skipped) { unsigned long block_offset = *pos & (block_size - 1); unsigned bytes_skipped = @@ -294,15 +309,15 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, poff += bytes_skipped; plen -= bytes_skipped; } - first = i; + first = next; /* truncate len if we find any trailing uptodate block(s) */ - while (++i <= last) { - if (ifs_block_is_uptodate(ifs, i)) { + if (++next <= last) { + next = ifs_next_uptodate_block(folio, next, last); + if (next <= last) { plen -= iomap_bytes_to_truncate(*pos + plen, - block_bits, last - i + 1); - last = i - 1; - break; + block_bits, last - next + 1); + last = next - 1; } } } @@ -639,7 +654,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned first, last, i; + unsigned first, last; if (!ifs) return false; @@ -651,10 +666,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) first = from >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits; - for (i = first; i <= last; i++) - if (!ifs_block_is_uptodate(ifs, i)) - return false; - return true; + return ifs_next_nonuptodate_block(folio, first, last) > last; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
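
Editor's note: a worked example of the truncation arithmetic from patch 2
("iomap: account for unaligned end offsets when truncating read range").
The sketch below is an editorial userspace illustration, not part of the
series; it mirrors the iomap_bytes_to_truncate() helper added by the patch
and plugs in made-up numbers (1k blocks, a range ending 256 bytes into its
last block) to show how the old 'blocks * block_size' calculation
over-truncates.

/*
 * Editorial illustration of the arithmetic behind patch 2.  The helper
 * below mirrors iomap_bytes_to_truncate() from the patch; block size and
 * offsets are made-up numbers.
 */
#include <stdio.h>

static unsigned bytes_to_truncate(long long end_pos, unsigned block_bits,
				  unsigned blocks_truncated)
{
	unsigned block_size = 1u << block_bits;
	unsigned block_offset = end_pos & (block_size - 1);

	if (!block_offset)
		return blocks_truncated << block_bits;
	return ((blocks_truncated - 1) << block_bits) + block_offset;
}

int main(void)
{
	unsigned block_bits = 10;		/* 1k blocks */
	long long end_pos = 4096 + 256;		/* range ends 256 bytes into a block */
	unsigned blocks_truncated = 2;		/* two trailing blocks are uptodate */

	/* old calculation: whole blocks only, over-truncates by 768 bytes */
	printf("old: %u bytes\n", blocks_truncated << block_bits);
	/* new calculation: the last block only contributes its 256-byte tail */
	printf("new: %u bytes\n",
	       bytes_to_truncate(end_pos, block_bits, blocks_truncated));
	return 0;
}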
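
Editor's note: the accounting change in patch 4 ("iomap: optimize pending
async writeback accounting") can be modelled outside the kernel. The sketch
below is a minimal userspace model under assumed numbers: the folio size,
the submitted ranges, and the finish_write() helper are stand-ins, and C11
atomics stand in for the kernel's atomic_t. It shows how initializing the
counter to the folio size and subtracting the bytes that were never
submitted still ends writeback exactly on the final completion, with one
atomic operation per submitted range plus two fixed ones (N + 2 rather
than 2N + 2).

/*
 * Editorial model of the write_bytes_pending change in patch 4.  All names
 * and numbers here are stand-ins; C11 atomics stand in for atomic_t.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FOLIO_SIZE 4096

static atomic_size_t write_bytes_pending;

/* Loosely mirrors iomap_finish_folio_write(): returns true on the last byte. */
static bool finish_write(size_t len)
{
	return atomic_fetch_sub(&write_bytes_pending, len) == len;
}

int main(void)
{
	/* Two dirty ranges get submitted; the rest of the folio is clean. */
	const size_t ranges[] = { 1024, 512 };
	size_t bytes_submitted = 0;

	/*
	 * One atomic store covers the whole folio up front, so completions
	 * that race with submission cannot end writeback early.
	 */
	atomic_store(&write_bytes_pending, FOLIO_SIZE);

	for (size_t i = 0; i < 2; i++)
		bytes_submitted += ranges[i];	/* no atomics while submitting */

	/* Drop the part of the folio that was never submitted for IO. */
	bool ended = finish_write(FOLIO_SIZE - bytes_submitted);
	printf("ended after submission fixup: %d\n", ended);

	/* One decrement per completed range: N + 2 atomics in total. */
	for (size_t i = 0; i < 2; i++)
		if (finish_write(ranges[i]))
			printf("writeback ended after range %zu\n", i);
	return 0;
}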
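
Editor's note: patch 6's case analysis can also be checked mechanically.
The following editorial sketch (not part of the series) brute-forces small
values under the preconditions stated in the commit message -- plen > 0,
from <= to, to > poff, and poff <= from < poff + plen -- and asserts that
the old three-clause skip condition and the simplified one always agree.
Block and folio sizes are arbitrary.

/*
 * Editorial check of patch 6: under the preconditions argued in the commit
 * message, the old and new skip-read conditions are equivalent.  Sizes are
 * arbitrary stand-ins.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define BLOCK_SIZE	4
#define FOLIO_SIZE	16

int main(void)
{
	for (unsigned poff = 0; poff < FOLIO_SIZE; poff += BLOCK_SIZE)
	for (unsigned plen = BLOCK_SIZE; poff + plen <= FOLIO_SIZE; plen += BLOCK_SIZE)
	for (unsigned from = 0; from <= FOLIO_SIZE; from++)
	for (unsigned to = from; to <= FOLIO_SIZE; to++) {
		/* preconditions from the commit message */
		if (to <= poff)				/* else plen would be 0 */
			continue;
		if (from < poff || from >= poff + plen)	/* writes overlap the range */
			continue;

		bool old_skip = (from <= poff || from >= poff + plen) &&
				(to <= poff || to >= poff + plen);
		bool new_skip = from <= poff && to >= poff + plen;

		assert(old_skip == new_skip);
	}
	printf("old and new skip conditions agree\n");
	return 0;
}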
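
Editor's note: finally, a rough userspace illustration of the
find_next_bit()-style scanning used by patches 7 and 8. This is an
editorial sketch with simplified stand-ins: the helpers below only handle
a single bitmap word and use __builtin_ctzl() in place of the kernel's
__ffs()-based find_next_bit()/find_next_zero_bit(); find_dirty_run()
loosely mirrors ifs_find_dirty_range(), locating each dirty run with two
scans instead of testing every block bit.

/*
 * Editorial sketch of find_next_bit()-style scanning (patches 7 and 8).
 * Single-word stand-ins only; the kernel helpers operate on full bitmaps.
 */
#include <stdio.h>

/* Return the first set bit in [start, size), or size if there is none. */
static unsigned find_next_bit_word(unsigned long word, unsigned size,
				   unsigned start)
{
	if (start >= size)
		return size;
	word &= ~0UL << start;			/* drop bits below start */
	if (!word)
		return size;
	start = (unsigned)__builtin_ctzl(word);	/* __ffs()-style scan */
	return start < size ? start : size;
}

static unsigned find_next_zero_bit_word(unsigned long word, unsigned size,
					unsigned start)
{
	return find_next_bit_word(~word, size, start);
}

/*
 * Loosely mirrors ifs_find_dirty_range(): find the dirty run starting at or
 * after *start_blk within [*start_blk, end_blk]; returns its length in
 * blocks, or 0 if none.
 */
static unsigned find_dirty_run(unsigned long dirty, unsigned end_blk,
			       unsigned *start_blk)
{
	unsigned first = find_next_bit_word(dirty, end_blk + 1, *start_blk);

	if (first > end_blk)
		return 0;
	*start_blk = first;
	return find_next_zero_bit_word(dirty, end_blk + 1, first + 1) - first;
}

int main(void)
{
	unsigned long dirty = 0x1b8;	/* blocks 3-5 and 7-8 are dirty */
	unsigned start = 0, len;

	while ((len = find_dirty_run(dirty, 9, &start))) {
		printf("dirty range: blocks %u-%u\n", start, start + len - 1);
		start += len;
	}
	return 0;
}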