linux/fs/ext4/move_extent.c

// SPDX-License-Identifier: LGPL-2.1
/*
 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
 *            Akira Fujita <a-fujita@rs.jp.nec.com>
 */

#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

/*
 * State shared by the move-extent helpers for one ext4_move_extents() call:
 * the two inodes taking part in the exchange and the range that is currently
 * being processed.
 */
struct mext_data {
	struct inode *orig_inode;	/* Origin file inode */
	struct inode *donor_inode;	/* Donor file inode */
	struct ext4_map_blocks orig_map;/* Origin file's mapping of the range to move */
	ext4_lblk_t donor_lblk;		/* Start block of the range in the donor file */
};

/**
 * ext4_double_down_write_data_sem() - write lock two inodes' i_data_sem
 * @first: inode to be locked
 * @second: inode to be locked
 *
 * Take the i_data_sem write lock of both inodes. The inode at the lower
 * address is always locked first, so the nesting order does not depend on
 * the order of the arguments; the second acquisition uses the
 * I_DATA_SEM_OTHER subclass.
 */
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
	/* Arrange for @first to be the lower-addressed inode. */
	if (!(first < second))
		swap(first, second);

	down_write(&EXT4_I(first)->i_data_sem);
	down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
}

/**
 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
 *
 * @orig_inode:		original inode structure to be released its lock first
 * @donor_inode:	donor inode structure to be released its lock second
 * Release write lock of i_data_sem of two inodes (orig and donor).
 */
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
			      struct inode *donor_inode)
{
	up_write(&EXT4_I(orig_inode)->i_data_sem);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
}

/*
 * Get and lock one folio from each of @inode1 and @inode2, taking them in
 * inode address order. @index1 and @index2 are the page cache indexes to
 * look up and @len feeds the folio order hint.
 *
 * On success 0 is returned, folio[0] belongs to @inode1 and folio[1] to
 * @inode2, and writeback has been waited for on both. On failure a negative
 * errno is returned and no folio is left locked.
 */
static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
				  pgoff_t index1, pgoff_t index2, size_t len,
				  struct folio *folio[2])
{
	struct address_space *first, *second;
	pgoff_t first_index, second_index;
	unsigned int nofs_flags;
	fgf_t fgp_flags;

	BUG_ON(!inode1 || !inode2);

	/* The folio of the lower-addressed inode is always taken first. */
	if (inode1 < inode2) {
		first = inode1->i_mapping;
		first_index = index1;
		second = inode2->i_mapping;
		second_index = index2;
	} else {
		first = inode2->i_mapping;
		first_index = index2;
		second = inode1->i_mapping;
		second_index = index1;
	}

	fgp_flags = FGP_WRITEBEGIN | fgf_set_order(len);

	nofs_flags = memalloc_nofs_save();
	folio[0] = __filemap_get_folio(first, first_index, fgp_flags,
				       mapping_gfp_mask(first));
	if (IS_ERR(folio[0])) {
		memalloc_nofs_restore(nofs_flags);
		return PTR_ERR(folio[0]);
	}

	folio[1] = __filemap_get_folio(second, second_index, fgp_flags,
				       mapping_gfp_mask(second));
	memalloc_nofs_restore(nofs_flags);
	if (IS_ERR(folio[1])) {
		/* Undo the first lookup before reporting the failure. */
		folio_unlock(folio[0]);
		folio_put(folio[0]);
		return PTR_ERR(folio[1]);
	}

	/*
	 * __filemap_get_folio() may not wait on folio's writeback if
	 * BDI not demand that. But it is reasonable to be very conservative
	 * here and explicitly wait on folio's writeback
	 */
	folio_wait_writeback(folio[0]);
	folio_wait_writeback(folio[1]);

	/* Hand the folios back in argument order: folio[0] is @inode1's. */
	if (inode1 > inode2)
		swap(folio[0], folio[1]);

	return 0;
}

/* Unlock both folios and drop their references, folio[0] first. */
static void mext_folio_double_unlock(struct folio *folio[2])
{
	int i;

	for (i = 0; i < 2; i++) {
		folio_unlock(folio[i]);
		folio_put(folio[i]);
	}
}

/*
 * Force the buffers of @folio that overlap the byte range [@from, @to)
 * uptodate without dropping the folio's lock.
 *
 * @folio: locked folio with no writeback in progress (both are BUG_ON
 *         checked below)
 * @from:  start of the range, as a byte offset inside the folio
 * @to:    end of the range (exclusive), as a byte offset inside the folio
 *
 * Returns 0 on success, the error from ext4_get_block() if a buffer cannot
 * be mapped, or -EIO if a buffer is still not uptodate after its read has
 * been waited for.
 */
static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to)
{
	struct inode *inode = folio->mapping->host;
	sector_t block;
	struct buffer_head *bh, *head;
	unsigned int blocksize, block_start, block_end;
	int nr = 0;		/* number of buffer reads started */
	bool partial = false;	/* a buffer outside the range is not uptodate */

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(folio_test_writeback(folio));

	/* Nothing to do if the whole folio is already uptodate. */
	if (folio_test_uptodate(folio))
		return 0;

	blocksize = i_blocksize(inode);
	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);

	/* Logical block number of the first buffer in the folio. */
	block = folio_pos(folio) >> inode->i_blkbits;
	block_end = 0;
	bh = head;
	/*
	 * First pass: walk every buffer of the folio. Note that "continue"
	 * in a do/while jumps to the loop condition, so block and bh are
	 * still advanced for skipped buffers.
	 */
	do {
		block_start = block_end;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			/*
			 * Outside the requested range: leave it alone, but
			 * remember that the folio as a whole cannot be
			 * marked uptodate.
			 */
			if (!buffer_uptodate(bh))
				partial = true;
			continue;
		}
		if (buffer_uptodate(bh))
			continue;
		if (!buffer_mapped(bh)) {
			int err = ext4_get_block(inode, block, bh, 0);
			if (err)
				return err;
			if (!buffer_mapped(bh)) {
				/* Still unmapped: zero it instead of reading. */
				folio_zero_range(folio, block_start, blocksize);
				set_buffer_uptodate(bh);
				continue;
			}
		}
		lock_buffer(bh);
		/* Re-check under the buffer lock before starting a read. */
		if (buffer_uptodate(bh)) {
			unlock_buffer(bh);
			continue;
		}
		/* Start the read; it is waited for in the second pass. */
		ext4_read_bh_nowait(bh, 0, NULL, false);
		nr++;
	} while (block++, (bh = bh->b_this_page) != head);

	/* No io required */
	if (!nr)
		goto out;

	/* Second pass: wait for the buffers inside [from, to). */
	bh = head;
	do {
		if (bh_offset(bh) + blocksize <= from)
			continue;
		if (bh_offset(bh) >= to)
			break;
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			continue;
		return -EIO;
	} while ((bh = bh->b_this_page) != head);
out:
	/* Only mark the folio uptodate when every buffer in it is. */
	if (!partial)
		folio_mark_uptodate(folio);
	return 0;
}

/*
 * How a candidate range has to be handled, as decided by mext_move_begin().
 * The values are numbered from 0; mext_move_extent() starts out with move
 * type 0, i.e. MEXT_SKIP_EXTENT.
 */
enum mext_move_type {
	MEXT_SKIP_EXTENT,	/* donor range is a hole or delalloc: do nothing */
	MEXT_MOVE_EXTENT,	/* both ranges are unwritten: swap extents only */
	MEXT_COPY_DATA		/* swap extents and copy the origin data over */
};

/*
 * Start to move extent between the origin inode and the donor inode,
 * hold one folio for each inode and check the candidate moving extent
 * mapping status again.
 */
static int mext_move_begin(struct mext_data *mext, struct folio *folio[2],
			   enum mext_move_type *move_type)
{
	struct inode *orig_inode = mext->orig_inode;
	struct inode *donor_inode = mext->donor_inode;
	unsigned int blkbits = orig_inode->i_blkbits;
	struct ext4_map_blocks donor_map = {0};
	loff_t orig_pos, donor_pos;
	size_t move_len;
	int ret;

	orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits;
	donor_pos = ((loff_t)mext->donor_lblk) << blkbits;
	ret = mext_folio_double_lock(orig_inode, donor_inode,
			orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT,
			((size_t)mext->orig_map.m_len) << blkbits, folio);
	if (ret)
		return ret;

	/*
	 * Check the origin inode's mapping information again under the
	 * folio lock, as we do not hold the i_data_sem at all times, and
	 * it may change during the concurrent write-back operation.
	 */
	if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) {
		ret = -ESTALE;
		goto error;
	}

	/* Adjust the moving length according to the length of shorter folio. */
	move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos,
			folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos);
	move_len >>= blkbits;
	if (move_len < mext->orig_map.m_len)
		mext->orig_map.m_len = move_len;

	donor_map.m_lblk = mext->donor_lblk;
	donor_map.m_len = mext->orig_map.m_len;
	donor_map.m_flags = 0;
	ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0);
	if (ret < 0)
		goto error;

	/* Adjust the moving length according to the donor mapping length. */
	mext->orig_map.m_len = donor_map.m_len;

	/* Skip moving if the donor range is a hole or a delalloc extent. */
	if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)))
		*move_type = MEXT_SKIP_EXTENT;
	/* If both mapping ranges are unwritten, no need to copy data. */
	else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) &&
		 (donor_map.m_flags & EXT4_MAP_UNWRITTEN))
		*move_type = MEXT_MOVE_EXTENT;
	else
		*move_type = MEXT_COPY_DATA;

	return 0;
error:
	mext_folio_double_unlock(folio);
	return ret;
}

/*
 * Map the buffers of @folio that overlap the byte range [@from, @to) again
 * through ext4_get_block(), creating the folio's buffers first if it has
 * none, and then commit the whole range with block_commit_write().
 *
 * Returns 0 on success or the error reported by ext4_get_block().
 */
static int mext_folio_mkwrite(struct inode *inode, struct folio *folio,
			      size_t from, size_t to)
{
	unsigned int blocksize = i_blocksize(inode);
	sector_t lblk = folio_pos(folio) >> inode->i_blkbits;
	struct buffer_head *head, *bh;
	size_t start = 0, end;
	int err;

	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);

	bh = head;
	do {
		end = start + blocksize;

		/* Only buffers overlapping [from, to) are mapped again. */
		if (start < to && end > from) {
			err = ext4_get_block(inode, lblk, bh, 0);
			if (err)
				return err;
		}

		start = end;
		lblk++;
		bh = bh->b_this_page;
	} while (bh != head);

	block_commit_write(folio, from, to);
	return 0;
}

/*
 * Move one range described by @mext from the donor inode into the origin
 * inode. The range is trimmed by mext_move_begin() so that it fits in one
 * folio of each inode and in one donor mapping.
 *
 * For MEXT_COPY_DATA the origin data is first read into the page cache, the
 * extents of the two inodes are swapped, and the cached data is then mapped
 * onto the new origin blocks and committed. For MEXT_MOVE_EXTENT only the
 * extents are swapped. For MEXT_SKIP_EXTENT nothing is changed.
 *
 * The number of blocks that ended up swapped is passed out through @m_len
 * (0 if nothing was swapped or the swap had to be reverted). Returns 0 on
 * success, and an error code otherwise.
 */
static int mext_move_extent(struct mext_data *mext, u64 *m_len)
{
	struct inode *orig_inode = mext->orig_inode;
	struct inode *donor_inode = mext->donor_inode;
	struct ext4_map_blocks *orig_map = &mext->orig_map;
	unsigned int blkbits = orig_inode->i_blkbits;
	struct folio *folio[2] = {NULL, NULL};
	/*
	 * @from is only assigned for MEXT_COPY_DATA. Its later use is reached
	 * for that move type only, since the other two types jump to "unlock"
	 * before it.
	 */
	loff_t from, length;
	enum mext_move_type move_type = 0;
	handle_t *handle;
	u64 r_len = 0;
	unsigned int credits;
	int ret, ret2;

	*m_len = 0;
	trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode,
				     mext->donor_lblk);
	credits = ext4_chunk_trans_extent(orig_inode, 0) * 2;
	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* On success both folios are locked; on failure neither is. */
	ret = mext_move_begin(mext, folio, &move_type);
	if (ret)
		goto stop_handle;

	if (move_type == MEXT_SKIP_EXTENT)
		goto unlock;

	/*
	 * Copy the data. First, read the original inode data into the page
	 * cache. Then, release the existing mapping relationships and swap
	 * the extent. Finally, re-establish the new mapping relationships
	 * and dirty the page cache.
	 */
	if (move_type == MEXT_COPY_DATA) {
		from = offset_in_folio(folio[0],
				((loff_t)orig_map->m_lblk) << blkbits);
		length = ((loff_t)orig_map->m_len) << blkbits;

		ret = mext_folio_mkuptodate(folio[0], from, from + length);
		if (ret)
			goto unlock;
	}

	/* Both folios must give up their buffers before the swap. */
	if (!filemap_release_folio(folio[0], 0) ||
	    !filemap_release_folio(folio[1], 0)) {
		ret = -EBUSY;
		goto unlock;
	}

	/* Move extent */
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
	*m_len = ext4_swap_extents(handle, orig_inode, donor_inode,
				   orig_map->m_lblk, mext->donor_lblk,
				   orig_map->m_len, 1, &ret);
	ext4_double_up_write_data_sem(orig_inode, donor_inode);

	/* A short-length swap cannot occur after a successful swap extent. */
	if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len)))
		ret = -EIO;

	/* Nothing was swapped, or there is no data to copy: we are done. */
	if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT))
		goto unlock;

	/* Copy data */
	length = (*m_len) << blkbits;
	ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length);
	if (ret2) {
		/* Keep an earlier swap error, otherwise report this one. */
		if (!ret)
			ret = ret2;
		goto repair_branches;
	}
	/*
	 * Even in case of data=writeback it is reasonable to pin
	 * inode to transaction, to prevent unexpected data loss.
	 */
	ret2 = ext4_jbd2_inode_add_write(handle, orig_inode,
			((loff_t)orig_map->m_lblk) << blkbits, length);
	if (!ret)
		ret = ret2;
unlock:
	mext_folio_double_unlock(folio);
stop_handle:
	ext4_journal_stop(handle);
out:
	trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode,
				    mext->donor_lblk, orig_map->m_len, *m_len,
				    move_type, ret);
	return ret;

	/*
	 * Re-mapping the origin folio failed after the extents were swapped:
	 * swap the same number of blocks back. If even that does not fully
	 * succeed, flag a filesystem error. Either way no blocks are reported
	 * as moved.
	 */
repair_branches:
	ret2 = 0;
	r_len = ext4_swap_extents(handle, donor_inode, orig_inode,
				  mext->donor_lblk, orig_map->m_lblk,
				  *m_len, 0, &ret2);
	if (ret2 || r_len != *m_len) {
		ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk),
				       EIO, "Unable to copy data block, data will be lost!");
		ret = -EIO;
	}
	*m_len = 0;
	goto unlock;
}

/*
 * Check the validity of the basic filesystem environment and the
 * inodes' support status.
 */
static int mext_check_validity(struct inode *orig_inode,
			       struct inode *donor_inode)
{
	struct super_block *sb = orig_inode->i_sb;

	/* origin and donor should be different inodes */
	if (orig_inode == donor_inode) {
		ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* origin and donor should belone to the same filesystem */
	if (orig_inode->i_sb != donor_inode->i_sb) {
		ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* Regular file check */
	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
		ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if (ext4_has_feature_bigalloc(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported with bigalloc");
		return -EOPNOTSUPP;
	}

	if (IS_DAX(orig_inode)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported with DAX");
		return -EOPNOTSUPP;
	}

	/*
	 * TODO: it's not obvious how to swap blocks for inodes with full
	 * journaling enabled.
	 */
	if (ext4_should_journal_data(orig_inode) ||
	    ext4_should_journal_data(donor_inode)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported with data journaling");
		return -EOPNOTSUPP;
	}

	if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported for encrypted files");
		return -EOPNOTSUPP;
	}

	/* Ext4 move extent supports only extent based file */
	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) ||
	    !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
		ext4_msg(sb, KERN_ERR,
			 "Online defrag not supported for non-extent files");
		return -EOPNOTSUPP;
	}

	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
		ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) {
		ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EPERM;
	}

	/* Ext4 move extent does not support swap files */
	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -ETXTBSY;
	}

	if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EOPNOTSUPP;
	}

	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
		ext4_debug("ext4 move extent: File size is 0 byte\n");
		return -EINVAL;
	}

	return 0;
}

/*
 * Check whether the range passed to ext4_move_extents() can be exchanged
 * between the two files, and clamp the length so that it fits within both
 * file sizes.
 *
 * @orig_inode:  origin inode
 * @donor_inode: donor inode
 * @orig_start:  first block of the range in the origin file
 * @donor_start: first block of the range in the donor file
 * @len:         in: requested number of blocks; out: clamped number of blocks
 *
 * Returns 0 on success, or -EINVAL if the start offsets are not aligned
 * with each other, the range exceeds EXT_MAX_BLOCKS, or the (clamped)
 * length is zero.
 */
static int mext_check_adjust_range(struct inode *orig_inode,
				   struct inode *donor_inode, __u64 orig_start,
				   __u64 donor_start, __u64 *len)
{
	__u64 orig_eof, donor_eof;

	/* Start offset should be same */
	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
		ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if ((orig_start >= EXT_MAX_BLOCKS) ||
	    (donor_start >= EXT_MAX_BLOCKS) ||
	    (*len > EXT_MAX_BLOCKS) ||
	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
		ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n",
			   EXT_MAX_BLOCKS,
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode));
	donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode));

	/*
	 * The clamp condition is written as "eof + 1 < start + len" and not
	 * as "eof < start + len - 1". The two are equivalent whenever
	 * start + len is non-zero, but the latter wraps around in unsigned
	 * arithmetic when start and len are both 0. That would turn a
	 * zero-length request (or a length already clamped to 0 by the
	 * origin check) into a move of the whole file instead of -EINVAL.
	 *
	 * NOTE(review): a range with start + len == eof + 1 is deliberately
	 * still left unclamped to preserve the existing behaviour; confirm
	 * whether reaching one block past eof is intended.
	 */
	if (orig_eof <= orig_start)
		*len = 0;
	else if (orig_eof + 1 < orig_start + *len)
		*len = orig_eof - orig_start;
	if (donor_eof <= donor_start)
		*len = 0;
	else if (donor_eof + 1 < donor_start + *len)
		*len = donor_eof - donor_start;
	if (!*len) {
		ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	return 0;
}

/**
 * ext4_move_extents - Exchange the specified range of a file
 *
 * @o_filp:		file structure of the original file
 * @d_filp:		file structure of the donor file
 * @orig_blk:		start offset in block for orig
 * @donor_blk:		start offset in block for donor
 * @len:		the number of blocks to be moved
 * @moved_len:		moved block length
 *
 * Both inodes are locked for the duration of the call. The range is
 * validated and clamped, and then processed piece by piece: each iteration
 * looks up the origin mapping and, unless it is a hole or a delalloc
 * extent, hands it to mext_move_extent().
 *
 * This function returns 0 and moved block length is set in moved_len
 * if succeed, otherwise returns error value. @moved_len also accounts for
 * blocks that were moved before an error stopped the operation.
 */
int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
		      __u64 donor_blk, __u64 len, __u64 *moved_len)
{
	struct inode *orig_inode = file_inode(o_filp);
	struct inode *donor_inode = file_inode(d_filp);
	struct mext_data mext;
	struct super_block *sb = orig_inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int retries = 0;
	u64 m_len;
	int ret;

	*moved_len = 0;

	/* Protect orig and donor inodes against a truncate */
	lock_two_nondirectories(orig_inode, donor_inode);

	ret = mext_check_validity(orig_inode, donor_inode);
	if (ret)
		goto out;

	/* Wait for all existing dio workers */
	inode_dio_wait(orig_inode);
	inode_dio_wait(donor_inode);

	/* Check and adjust the specified move_extent range. */
	ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk,
				      donor_blk, &len);
	if (ret)
		goto out;

	mext.orig_inode = orig_inode;
	mext.donor_inode = donor_inode;
	while (len) {
		/* Describe the remaining range and look up its mapping. */
		mext.orig_map.m_lblk = orig_blk;
		mext.orig_map.m_len = len;
		mext.orig_map.m_flags = 0;
		mext.donor_lblk = donor_blk;

		ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0);
		if (ret < 0)
			goto out;

		/* Skip moving if it is a hole or a delalloc extent. */
		if (mext.orig_map.m_flags &
		    (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) {
			ret = mext_move_extent(&mext, &m_len);
			/* Count whatever was moved, even on failure. */
			*moved_len += m_len;
			if (!ret)
				goto next;

			/* Move failed or partially failed. */
			if (m_len) {
				orig_blk += m_len;
				donor_blk += m_len;
				len -= m_len;
			}
			/* The origin mapping changed: look it up again. */
			if (ret == -ESTALE)
				continue;
			if (ret == -ENOSPC &&
			    ext4_should_retry_alloc(sb, &retries))
				continue;
			/*
			 * -EBUSY is what mext_move_extent() returns when a
			 * folio could not be released. With a journal, force
			 * a commit and try again, a limited number of times.
			 */
			if (ret == -EBUSY &&
			    sbi->s_journal && retries++ < 4 &&
			    jbd2_journal_force_commit_nested(sbi->s_journal))
				continue;

			goto out;
		}
next:
		/*
		 * Step over the range just handled or skipped. m_len may
		 * have been shortened by the map lookup or the move itself.
		 */
		orig_blk += mext.orig_map.m_len;
		donor_blk += mext.orig_map.m_len;
		len -= mext.orig_map.m_len;
		retries = 0;
	}

out:
	/* Only discard preallocations if some blocks were actually moved. */
	if (*moved_len) {
		ext4_discard_preallocations(orig_inode);
		ext4_discard_preallocations(donor_inode);
	}

	unlock_two_nondirectories(orig_inode, donor_inode);
	return ret;
}