NFSv4/flexfiles: Add support for striped layouts

Update the lseg creation path to parse and add striped layouts, enabling
striped layout support.

Limitations:

1. All mirrors must have the same number of stripes.
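
For orientation: each mirror now spans dss_count data-server stripes
(struct nfs4_ff_layout_ds_stripe), presumably selected per I/O using the
layout's stripe_unit. A minimal sketch of that offset-to-stripe mapping,
assuming conventional sparse striping; the helper below is hypothetical
and not part of this patch:

#include <stdint.h>

/*
 * Illustrative only: map a file offset to a stripe index (dss_id)
 * within a mirror, assuming sparse striping across dss_count stripes
 * in units of stripe_unit bytes.
 */
static uint32_t calc_dss_id(uint64_t offset, uint64_t stripe_unit,
			    uint32_t dss_count)
{
	if (dss_count <= 1 || stripe_unit == 0)
		return 0;	/* unstriped (single-stripe) mirror */
	return (uint32_t)((offset / stripe_unit) % dss_count);
}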

Signed-off-by: Jonathan Curley <jcurley@purestorage.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>

Author:     Jonathan Curley
AuthorDate: 2025-09-24 16:20:50 +00:00
Committer:  Anna Schumaker
Commit:     20b1d75fb8 (parent 8a8e0f5566)

 2 files changed, 167 insertions(+), 102 deletions(-)

--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c

@@ -177,18 +177,19 @@ ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id,
 #endif
 }
 
-static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
-			       const struct nfs4_ff_layout_mirror *m2)
+static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1,
+			    const struct nfs4_ff_layout_ds_stripe *dss2)
 {
 	int i, j;
 
-	if (m1->dss[0].fh_versions_cnt != m2->dss[0].fh_versions_cnt)
+	if (dss1->fh_versions_cnt != dss2->fh_versions_cnt)
 		return false;
-	for (i = 0; i < m1->dss[0].fh_versions_cnt; i++) {
+
+	for (i = 0; i < dss1->fh_versions_cnt; i++) {
 		bool found_fh = false;
-		for (j = 0; j < m2->dss[0].fh_versions_cnt; j++) {
-			if (nfs_compare_fh(&m1->dss[0].fh_versions[i],
-					   &m2->dss[0].fh_versions[j]) == 0) {
+		for (j = 0; j < dss2->fh_versions_cnt; j++) {
+			if (nfs_compare_fh(&dss1->fh_versions[i],
+					   &dss2->fh_versions[j]) == 0) {
 				found_fh = true;
 				break;
 			}
@@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
 	return true;
 }
 
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+			       const struct nfs4_ff_layout_mirror *m2)
+{
+	u32 dss_id;
+
+	if (m1->dss_count != m2->dss_count)
+		return false;
+
+	for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+		if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id]))
+			return false;
+
+	return true;
+}
+
+static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1,
+				  const struct nfs4_ff_layout_mirror *m2)
+{
+	u32 dss_id;
+
+	if (m1->dss_count != m2->dss_count)
+		return false;
+
+	for (dss_id = 0; dss_id < m1->dss_count; dss_id++)
+		if (memcmp(&m1->dss[dss_id].devid,
+			   &m2->dss[dss_id].devid,
+			   sizeof(m1->dss[dss_id].devid)) != 0)
+			return false;
+
+	return true;
+}
+
 static struct nfs4_ff_layout_mirror *
 ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
 		     struct nfs4_ff_layout_mirror *mirror)
@@ -209,8 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
 	spin_lock(&inode->i_lock);
 	list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
-		if (memcmp(&mirror->dss[0].devid, &pos->dss[0].devid,
-			   sizeof(pos->dss[0].devid)) != 0)
+		if (!ff_mirror_match_devid(mirror, pos))
 			continue;
 		if (!ff_mirror_match_fh(mirror, pos))
 			continue;
@@ -241,13 +273,15 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
 static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 {
 	struct nfs4_ff_layout_mirror *mirror;
+	u32 dss_id;
 
 	mirror = kzalloc(sizeof(*mirror), gfp_flags);
 	if (mirror != NULL) {
 		spin_lock_init(&mirror->lock);
 		refcount_set(&mirror->ref, 1);
 		INIT_LIST_HEAD(&mirror->mirrors);
-		nfs_localio_file_init(&mirror->dss[0].nfl);
+		for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+			nfs_localio_file_init(&mirror->dss[dss_id].nfl);
 	}
 	return mirror;
 }
@@ -255,17 +289,19 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
 	const struct cred *cred;
-	int dss_id = 0;
+	u32 dss_id;
 
 	ff_layout_remove_mirror(mirror);
-	kfree(mirror->dss[dss_id].fh_versions);
-	nfs_close_local_fh(&mirror->dss[dss_id].nfl);
-	cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
-	put_cred(cred);
-	cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
-	put_cred(cred);
-	nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+	for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) {
+		kfree(mirror->dss[dss_id].fh_versions);
+		cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred);
+		put_cred(cred);
+		cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred);
+		put_cred(cred);
+		nfs_close_local_fh(&mirror->dss[dss_id].nfl);
+		nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds);
+	}
 
 	kfree(mirror->dss);
 	kfree(mirror);
@@ -371,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
 			free_me);
 }
 
+static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror)
+{
+	u32 dss_id, sum = 0;
+
+	for (dss_id = 0; dss_id < mirror->dss_count; dss_id++)
+		sum += mirror->dss[dss_id].efficiency;
+
+	return sum;
+}
+
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
 	int i, j;
 
 	for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
 		for (j = i + 1; j < fls->mirror_array_cnt; j++)
-			if (fls->mirror_array[i]->dss[0].efficiency <
-			    fls->mirror_array[j]->dss[0].efficiency)
+			if (ff_mirror_efficiency_sum(fls->mirror_array[i]) <
+			    ff_mirror_efficiency_sum(fls->mirror_array[j]))
 				swap(fls->mirror_array[i],
 				     fls->mirror_array[j]);
 	}
@@ -398,6 +444,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 	u32 mirror_array_cnt;
 	__be32 *p;
 	int i, rc;
+	struct nfs4_ff_layout_ds_stripe *dss_info;
 
 	dprintk("--> %s\n", __func__);
 	scratch = folio_alloc(gfp_flags, 0);
@@ -440,17 +487,24 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 		kuid_t uid;
 		kgid_t gid;
 		u32 fh_count, id;
-		int j, dss_id = 0;
+		int j, dss_id;
 
 		rc = -EIO;
 		p = xdr_inline_decode(&stream, 4);
 		if (!p)
 			goto out_err_free;
-		dss_count = be32_to_cpup(p);
-		/* FIXME: allow for striping? */
-		if (dss_count != 1)
+		// Ensure all mirrors have same stripe count.
+		if (dss_count == 0)
+			dss_count = be32_to_cpup(p);
+		else if (dss_count != be32_to_cpup(p))
+			goto out_err_free;
+
+		if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT ||
+		    dss_count == 0)
+			goto out_err_free;
+		if (dss_count > 1 && stripe_unit == 0)
 			goto out_err_free;
 
 		fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
@@ -464,91 +518,100 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 			kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe),
 				gfp_flags);
 
-		/* deviceid */
-		rc = decode_deviceid(&stream, &fls->mirror_array[i]->dss[dss_id].devid);
-		if (rc)
-			goto out_err_free;
-
-		/* efficiency */
-		rc = -EIO;
-		p = xdr_inline_decode(&stream, 4);
-		if (!p)
-			goto out_err_free;
-		fls->mirror_array[i]->dss[dss_id].efficiency = be32_to_cpup(p);
-
-		/* stateid */
-		rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->dss[dss_id].stateid);
-		if (rc)
-			goto out_err_free;
-
-		/* fh */
-		rc = -EIO;
-		p = xdr_inline_decode(&stream, 4);
-		if (!p)
-			goto out_err_free;
-		fh_count = be32_to_cpup(p);
-
-		fls->mirror_array[i]->dss[dss_id].fh_versions =
-			kcalloc(fh_count, sizeof(struct nfs_fh),
-				gfp_flags);
-		if (fls->mirror_array[i]->dss[dss_id].fh_versions == NULL) {
-			rc = -ENOMEM;
-			goto out_err_free;
-		}
-
-		for (j = 0; j < fh_count; j++) {
-			rc = decode_nfs_fh(&stream,
-					   &fls->mirror_array[i]->dss[dss_id].fh_versions[j]);
-			if (rc)
-				goto out_err_free;
-		}
-
-		fls->mirror_array[i]->dss[dss_id].fh_versions_cnt = fh_count;
-
-		/* user */
-		rc = decode_name(&stream, &id);
-		if (rc)
-			goto out_err_free;
-		uid = make_kuid(&init_user_ns, id);
-
-		/* group */
-		rc = decode_name(&stream, &id);
-		if (rc)
-			goto out_err_free;
-		gid = make_kgid(&init_user_ns, id);
-
-		if (gfp_flags & __GFP_FS)
-			kcred = prepare_kernel_cred(&init_task);
-		else {
-			unsigned int nofs_flags = memalloc_nofs_save();
-			kcred = prepare_kernel_cred(&init_task);
-			memalloc_nofs_restore(nofs_flags);
-		}
-		rc = -ENOMEM;
-		if (!kcred)
-			goto out_err_free;
-		kcred->fsuid = uid;
-		kcred->fsgid = gid;
-
-		cred = RCU_INITIALIZER(kcred);
-		if (lgr->range.iomode == IOMODE_READ)
-			rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].ro_cred, cred);
-		else
-			rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].rw_cred, cred);
+		for (dss_id = 0; dss_id < dss_count; dss_id++) {
+			dss_info = &fls->mirror_array[i]->dss[dss_id];
+			dss_info->mirror = fls->mirror_array[i];
+
+			/* deviceid */
+			rc = decode_deviceid(&stream, &dss_info->devid);
+			if (rc)
+				goto out_err_free;
+
+			/* efficiency */
+			rc = -EIO;
+			p = xdr_inline_decode(&stream, 4);
+			if (!p)
+				goto out_err_free;
+			dss_info->efficiency = be32_to_cpup(p);
+
+			/* stateid */
+			rc = decode_pnfs_stateid(&stream, &dss_info->stateid);
+			if (rc)
+				goto out_err_free;
+
+			/* fh */
+			rc = -EIO;
+			p = xdr_inline_decode(&stream, 4);
+			if (!p)
+				goto out_err_free;
+			fh_count = be32_to_cpup(p);
+
+			dss_info->fh_versions =
+				kcalloc(fh_count, sizeof(struct nfs_fh),
+					gfp_flags);
+			if (dss_info->fh_versions == NULL) {
+				rc = -ENOMEM;
+				goto out_err_free;
+			}
+
+			for (j = 0; j < fh_count; j++) {
+				rc = decode_nfs_fh(&stream,
+						   &dss_info->fh_versions[j]);
+				if (rc)
+					goto out_err_free;
+			}
+
+			dss_info->fh_versions_cnt = fh_count;
+
+			/* user */
+			rc = decode_name(&stream, &id);
+			if (rc)
+				goto out_err_free;
+			uid = make_kuid(&init_user_ns, id);
+
+			/* group */
+			rc = decode_name(&stream, &id);
+			if (rc)
+				goto out_err_free;
+			gid = make_kgid(&init_user_ns, id);
+
+			if (gfp_flags & __GFP_FS)
+				kcred = prepare_kernel_cred(&init_task);
+			else {
+				unsigned int nofs_flags = memalloc_nofs_save();
+				kcred = prepare_kernel_cred(&init_task);
+				memalloc_nofs_restore(nofs_flags);
+			}
+			rc = -ENOMEM;
+			if (!kcred)
+				goto out_err_free;
+			kcred->fsuid = uid;
+			kcred->fsgid = gid;
+
+			cred = RCU_INITIALIZER(kcred);
+			if (lgr->range.iomode == IOMODE_READ)
+				rcu_assign_pointer(dss_info->ro_cred, cred);
+			else
+				rcu_assign_pointer(dss_info->rw_cred, cred);
+		}
 
 		mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
 		if (mirror != fls->mirror_array[i]) {
-			/* swap cred ptrs so free_mirror will clean up old */
-			if (lgr->range.iomode == IOMODE_READ) {
-				cred = xchg(&mirror->dss[dss_id].ro_cred,
-					    fls->mirror_array[i]->dss[dss_id].ro_cred);
-				rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].ro_cred, cred);
-			} else {
-				cred = xchg(&mirror->dss[dss_id].rw_cred,
-					    fls->mirror_array[i]->dss[dss_id].rw_cred);
-				rcu_assign_pointer(fls->mirror_array[i]->dss[dss_id].rw_cred, cred);
-			}
+			for (dss_id = 0; dss_id < dss_count; dss_id++) {
+				dss_info = &fls->mirror_array[i]->dss[dss_id];
+				/* swap cred ptrs so free_mirror will clean up old */
+				if (lgr->range.iomode == IOMODE_READ) {
+					cred = xchg(&mirror->dss[dss_id].ro_cred,
+						    dss_info->ro_cred);
+					rcu_assign_pointer(dss_info->ro_cred, cred);
+				} else {
+					cred = xchg(&mirror->dss[dss_id].rw_cred,
+						    dss_info->rw_cred);
+					rcu_assign_pointer(dss_info->rw_cred, cred);
+				}
+			}
 			ff_layout_free_mirror(fls->mirror_array[i]);
 			fls->mirror_array[i] = mirror;
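
One rule worth calling out from the decode path above: a layout segment is
rejected if its mirrors disagree on stripe count (Limitation 1 in the commit
message). A standalone restatement of that check, for illustration only
(dss_counts[] stands in for the per-mirror values read off the XDR stream):

#include <stdbool.h>
#include <stdint.h>

static bool stripe_counts_consistent(const uint32_t *dss_counts,
				     unsigned int nr_mirrors)
{
	uint32_t dss_count = 0;	/* 0 means "not seen yet", as in the patch */
	unsigned int i;

	for (i = 0; i < nr_mirrors; i++) {
		if (dss_count == 0)
			dss_count = dss_counts[i];	/* first mirror sets it */
		else if (dss_count != dss_counts[i])
			return false;			/* mismatch: reject layout */
	}
	return dss_count != 0;
}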

--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h

@@ -21,6 +21,8 @@
  * due to network error etc. */
 #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096
+
 /* LAYOUTSTATS report interval in ms */
 #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
 #define FF_LAYOUTSTATS_MAXDEV 4
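
The new NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT cap pairs with the stripe-count
validation added to ff_layout_alloc_lseg() above. Restated as a standalone
predicate, purely for illustration (not code from this patch):

#include <stdbool.h>
#include <stdint.h>

#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096

static bool dss_count_valid(uint32_t dss_count, uint64_t stripe_unit)
{
	/* stripe count must lie in [1, NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT] */
	if (dss_count == 0 || dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT)
		return false;
	/* more than one stripe requires a nonzero stripe unit */
	if (dss_count > 1 && stripe_unit == 0)
		return false;
	return true;
}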