erofs: add 48-bit block addressing on-disk support

The current 32-bit block addressing limits EROFS to a 16TiB maximum
volume size with 4KiB blocks.  However, several new use cases now
require larger capacity support:
 - Massive datasets for model training in order to boost random
   sampling performance for each epoch;

 - Object storage clients using EROFS direct passthrough.

This extends core on-disk structures, such as inodes, device slots, and
inode chunks, to support 48-bit block addressing.

Additionally:
 - Expand superblock root NID to 8-byte `rootnid_8b` to enable full
   out-of-place update incremental builds;

 - Introduce an `epoch` field in the superblock and add an `mtime`
   field to 32-byte compact inodes for basic timestamp support.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Chao Yu <chao@kernel.org>
Link: https://lore.kernel.org/r/20250310095459.2620647-4-hsiangkao@linux.alibaba.com
This commit is contained in:
Gao Xiang 2025-03-10 17:54:53 +08:00
parent 3422dfa903
commit 61ba89b579
5 changed files with 61 additions and 69 deletions

View File

@ -95,7 +95,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
map->m_flags = EROFS_MAP_MAPPED; map->m_flags = EROFS_MAP_MAPPED;
if (map->m_la < pos) { if (map->m_la < pos) {
map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
map->m_llen = pos - map->m_la; map->m_llen = pos - map->m_la;
} else { } else {
map->m_pa = erofs_iloc(inode) + vi->inode_isize + map->m_pa = erofs_iloc(inode) + vi->inode_isize +
@ -124,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits, map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
round_up(inode->i_size - map->m_la, blksz)); round_up(inode->i_size - map->m_la, blksz));
if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
startblk = le32_to_cpu(idx->blkaddr); startblk = le32_to_cpu(idx->startblk_lo);
if (startblk != EROFS_NULL_ADDR) { if (startblk != EROFS_NULL_ADDR) {
map->m_deviceid = le16_to_cpu(idx->device_id) & map->m_deviceid = le16_to_cpu(idx->device_id) &
EROFS_SB(sb)->device_id_mask; EROFS_SB(sb)->device_id_mask;
@ -168,7 +168,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{ {
struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif; struct erofs_device_info *dif;
erofs_off_t startoff, length; erofs_off_t startoff;
int id; int id;
erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
@ -181,7 +181,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return -ENODEV; return -ENODEV;
} }
if (devs->flatdev) { if (devs->flatdev) {
map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); map->m_pa += erofs_pos(sb, dif->uniaddr);
up_read(&devs->rwsem); up_read(&devs->rwsem);
return 0; return 0;
} }
@ -190,13 +190,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
} else if (devs->extra_devices && !devs->flatdev) { } else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem); down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) { idr_for_each_entry(&devs->tree, dif, id) {
if (!dif->mapped_blkaddr) if (!dif->uniaddr)
continue; continue;
startoff = erofs_pos(sb, dif->mapped_blkaddr); startoff = erofs_pos(sb, dif->uniaddr);
length = erofs_pos(sb, dif->blocks);
if (map->m_pa >= startoff && if (map->m_pa >= startoff &&
map->m_pa < startoff + length) { map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
map->m_pa -= startoff; map->m_pa -= startoff;
erofs_fill_from_devinfo(map, sb, dif); erofs_fill_from_devinfo(map, sb, dif);
break; break;

View File

@ -30,25 +30,19 @@
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
#define EROFS_ALL_FEATURE_INCOMPAT \ #define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ ((EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES << 1) - 1)
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
EROFS_FEATURE_INCOMPAT_DEDUPE | \
EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
#define EROFS_SB_EXTSLOT_SIZE 16 #define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot { struct erofs_deviceslot {
u8 tag[64]; /* digest(sha256), etc. */ u8 tag[64]; /* digest(sha256), etc. */
__le32 blocks; /* total fs blocks of this device */ __le32 blocks_lo; /* total blocks count of this device */
__le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ __le32 uniaddr_lo; /* unified starting block of this device */
u8 reserved[56]; __le32 blocks_hi; /* total blocks count MSB */
__le16 uniaddr_hi; /* unified starting block MSB */
u8 reserved[50];
}; };
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
@ -59,13 +53,14 @@ struct erofs_super_block {
__le32 feature_compat; __le32 feature_compat;
__u8 blkszbits; /* filesystem block size in bit shift */ __u8 blkszbits; /* filesystem block size in bit shift */
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
union {
__le16 root_nid; /* nid of root directory */ __le16 rootnid_2b; /* nid of root directory */
__le16 blocks_hi; /* (48BIT on) blocks count MSB */
} rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */ __le64 inos; /* total valid ino # (== f_files - f_favail) */
__le64 epoch; /* base seconds used for compact inodes */
__le64 build_time; /* compact inode time derivation */ __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
__le32 build_time_nsec; /* compact inode time derivation in ns scale */ __le32 blocks_lo; /* blocks count LSB */
__le32 blocks; /* used for statfs */
__le32 meta_blkaddr; /* start block address of metadata area */ __le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */
__u8 uuid[16]; /* 128-bit uuid for volume */ __u8 uuid[16]; /* 128-bit uuid for volume */
@ -84,7 +79,10 @@ struct erofs_super_block {
__le32 xattr_prefix_start; /* start of long xattr prefixes */ __le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */ __le64 packed_nid; /* nid of the special packed inode */
__u8 xattr_filter_reserved; /* reserved for xattr name filter */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */
__u8 reserved2[23]; __u8 reserved[3];
__le32 build_time; /* seconds added to epoch for mkfs time */
__le64 rootnid_8b; /* (48BIT on) nid of root directory */
__u8 reserved2[8];
}; };
/* /*
@ -117,17 +115,16 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_BIT 0 #define EROFS_I_VERSION_BIT 0
#define EROFS_I_DATALAYOUT_BIT 1 #define EROFS_I_DATALAYOUT_BIT 1
#define EROFS_I_ALL_BIT 4 #define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */
#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)
#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1)
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
/* with chunk indexes or just a 4-byte blkaddr array */ /* with chunk indexes or just a 4-byte block array */
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020 #define EROFS_CHUNK_FORMAT_INDEXES 0x0020
#define EROFS_CHUNK_FORMAT_48BIT 0x0040
#define EROFS_CHUNK_FORMAT_ALL \ #define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)
(EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
/* 32-byte on-disk inode */ /* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0 #define EROFS_INODE_LAYOUT_COMPACT 0
@ -140,45 +137,40 @@ struct erofs_inode_chunk_info {
}; };
union erofs_inode_i_u { union erofs_inode_i_u {
/* total compressed blocks for compressed inodes */ __le32 blocks_lo; /* total blocks count (if compressed inodes) */
__le32 compressed_blocks; __le32 startblk_lo; /* starting block number (if flat inodes) */
__le32 rdev; /* device ID (if special inodes) */
/* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
__le32 rdev;
/* for chunk-based files, it contains the summary info */
struct erofs_inode_chunk_info c; struct erofs_inode_chunk_info c;
}; };
union erofs_inode_i_nb {
__le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
__le16 blocks_hi; /* total blocks count MSB */
__le16 startblk_hi; /* starting block number MSB */
};
/* 32-byte reduced form of an ondisk inode */ /* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact { struct erofs_inode_compact {
__le16 i_format; /* inode format hints */ __le16 i_format; /* inode format hints */
/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount; __le16 i_xattr_icount;
__le16 i_mode; __le16 i_mode;
__le16 i_nlink; union erofs_inode_i_nb i_nb;
__le32 i_size; __le32 i_size;
__le32 i_reserved; __le32 i_mtime;
union erofs_inode_i_u i_u; union erofs_inode_i_u i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */ __le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid; __le16 i_uid;
__le16 i_gid; __le16 i_gid;
__le32 i_reserved2; __le32 i_reserved;
}; };
/* 64-byte complete form of an ondisk inode */ /* 64-byte complete form of an ondisk inode */
struct erofs_inode_extended { struct erofs_inode_extended {
__le16 i_format; /* inode format hints */ __le16 i_format; /* inode format hints */
/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount; __le16 i_xattr_icount;
__le16 i_mode; __le16 i_mode;
__le16 i_reserved; union erofs_inode_i_nb i_nb;
__le64 i_size; __le64 i_size;
union erofs_inode_i_u i_u; union erofs_inode_i_u i_u;
@ -248,6 +240,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
if (!i_xattr_icount) if (!i_xattr_icount)
return 0; return 0;
/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
return sizeof(struct erofs_xattr_ibody_header) + return sizeof(struct erofs_xattr_ibody_header) +
sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
} }
@ -266,11 +259,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 4-byte block address array */ /* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
/* 8-byte inode chunk indexes */ /* 8-byte inode chunk index */
struct erofs_inode_chunk_index { struct erofs_inode_chunk_index {
__le16 advise; /* always 0, don't care for now */ __le16 startblk_hi; /* starting block number MSB */
__le16 device_id; /* back-end storage id (with bits masked) */ __le16 device_id; /* back-end storage id (with bits masked) */
__le32 blkaddr; /* start block address of this inode chunk */ __le32 startblk_lo; /* starting block number of this chunk */
}; };
/* dirent sorts in alphabet order, thus we can do binary search */ /* dirent sorts in alphabet order, thus we can do binary search */

View File

@ -108,7 +108,7 @@ static int erofs_read_inode(struct inode *inode)
iu = dic->i_u; iu = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid)); i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink)); set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec); inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size); inode->i_size = le32_to_cpu(dic->i_size);
@ -129,7 +129,7 @@ static int erofs_read_inode(struct inode *inode)
case S_IFREG: case S_IFREG:
case S_IFDIR: case S_IFDIR:
case S_IFLNK: case S_IFLNK:
vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); vi->startblk = le32_to_cpu(iu.startblk_lo);
if(S_ISLNK(inode->i_mode)) { if(S_ISLNK(inode->i_mode)) {
err = erofs_fill_symlink(inode, ptr, ofs); err = erofs_fill_symlink(inode, ptr, ofs);
if (err) if (err)
@ -152,7 +152,7 @@ static int erofs_read_inode(struct inode *inode)
} }
if (erofs_inode_is_data_compressed(vi->datalayout)) if (erofs_inode_is_data_compressed(vi->datalayout))
inode->i_blocks = le32_to_cpu(iu.compressed_blocks) << inode->i_blocks = le32_to_cpu(iu.blocks_lo) <<
(sb->s_blocksize_bits - 9); (sb->s_blocksize_bits - 9);
else else
inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;

View File

@ -47,8 +47,8 @@ struct erofs_device_info {
struct dax_device *dax_dev; struct dax_device *dax_dev;
u64 dax_part_off; u64 dax_part_off;
u32 blocks; erofs_blk_t blocks;
u32 mapped_blkaddr; erofs_blk_t uniaddr;
}; };
enum { enum {
@ -252,7 +252,7 @@ struct erofs_inode {
unsigned int *xattr_shared_xattrs; unsigned int *xattr_shared_xattrs;
union { union {
erofs_blk_t raw_blkaddr; erofs_blk_t startblk;
struct { struct {
unsigned short chunkformat; unsigned short chunkformat;
unsigned char chunkbits; unsigned char chunkbits;

View File

@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
dif->file = file; dif->file = file;
} }
dif->blocks = le32_to_cpu(dis->blocks); dif->blocks = le32_to_cpu(dis->blocks_lo);
dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
sbi->total_blocks += dif->blocks; sbi->total_blocks += dif->blocks;
*pos += EROFS_DEVT_SLOT_SIZE; *pos += EROFS_DEVT_SLOT_SIZE;
return 0; return 0;
@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size); sbi->sb_size);
goto out; goto out;
} }
sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR #ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@ -308,12 +308,12 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif #endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid); sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
sbi->inos = le64_to_cpu(dsb->inos); sbi->inos = le64_to_cpu(dsb->inos);
sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time = le64_to_cpu(dsb->epoch);
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); sbi->build_time_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));